vllm.envs ¶
   CMAKE_BUILD_TYPE  module-attribute  ¶
 CMAKE_BUILD_TYPE: (
    Literal["Debug", "Release", "RelWithDebInfo"] | None
) = None
  VLLM_ALL2ALL_BACKEND  module-attribute  ¶
 VLLM_ALL2ALL_BACKEND: Literal[
    "naive",
    "pplx",
    "deepep_high_throughput",
    "deepep_low_latency",
    "allgather_reducescatter",
    "flashinfer_all2allv",
] = "allgather_reducescatter"
  VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE  module-attribute  ¶
 VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = (
    False
)
  VLLM_ALLOW_INSECURE_SERIALIZATION  module-attribute  ¶
 VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
  VLLM_DEBUG_LOG_API_SERVER_RESPONSE  module-attribute  ¶
 VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
  VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE  module-attribute  ¶
 VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False
  VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK  module-attribute  ¶
 VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = True
  VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL  module-attribute  ¶
 VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False
  VLLM_DEEP_GEMM_WARMUP  module-attribute  ¶
 VLLM_DEEP_GEMM_WARMUP: Literal["skip", "full", "relax"] = (
    "relax"
)
  VLLM_DISABLE_SHARED_EXPERTS_STREAM  module-attribute  ¶
 VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
  VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING  module-attribute  ¶
 VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
  VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING  module-attribute  ¶
 VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
  VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION  module-attribute  ¶
 VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
  VLLM_FLASHINFER_MOE_BACKEND  module-attribute  ¶
 VLLM_FLASHINFER_MOE_BACKEND: Literal[
    "throughput", "latency"
] = "throughput"
  VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH  module-attribute  ¶
 VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
  VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS  module-attribute  ¶
 VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
  VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS  module-attribute  ¶
    VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES  module-attribute  ¶
 VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
  VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE  module-attribute  ¶
 VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
  VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME  module-attribute  ¶
 VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = (
    "VLLM_OBJECT_STORAGE_SHM_BUFFER"
)
  VLLM_RAY_DP_PACK_STRATEGY  module-attribute  ¶
 VLLM_RAY_DP_PACK_STRATEGY: Literal[
    "strict", "fill", "span"
] = "strict"
  VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16  module-attribute  ¶
 VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
  VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB  module-attribute  ¶
 VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
  VLLM_ROCM_QUICK_REDUCE_QUANTIZATION  module-attribute  ¶
 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[
    "FP", "INT8", "INT6", "INT4", "NONE"
] = "NONE"
  VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS  module-attribute  ¶
 VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True
  VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION  module-attribute  ¶
 VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
  VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL  module-attribute  ¶
 VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
  VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY  module-attribute  ¶
 VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
  VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS  module-attribute  ¶
 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
  VLLM_TORCH_PROFILER_RECORD_SHAPES  module-attribute  ¶
 VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
  VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY  module-attribute  ¶
 VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
  VLLM_USE_FLASHINFER_MOE_MXFP4_BF16  module-attribute  ¶
 VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
  VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8  module-attribute  ¶
 VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
  VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS  module-attribute  ¶
 VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
  VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE  module-attribute  ¶
 VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal[
    "auto", "nccl", "shm"
] = "auto"
  VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM  module-attribute  ¶
 VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
  VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL  module-attribute  ¶
 VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
  VLLM_V1_USE_PREFILL_DECODE_ATTENTION  module-attribute  ¶
 VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
  VLLM_WORKER_MULTIPROC_METHOD  module-attribute  ¶
 VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = (
    "fork"
)
  VLLM_XLA_CACHE_PATH  module-attribute  ¶
 VLLM_XLA_CACHE_PATH: str = join(
    VLLM_CACHE_ROOT, "xla_cache"
)
  environment_variables  module-attribute  ¶
 environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_TARGET_DEVICE": lambda: lower(),
    "VLLM_MAIN_CUDA_VERSION": lambda: lower() or "12.8",
    "MAX_JOBS": lambda: getenv("MAX_JOBS", None),
    "NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
    "VLLM_USE_PRECOMPILED": lambda: lower() in ("1", "true")
    or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
    "VLLM_DOCKER_BUILD_CONTEXT": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
        int(
            getenv(
                "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
                "0",
            )
        )
    ),
    "CMAKE_BUILD_TYPE": env_with_choices(
        "CMAKE_BUILD_TYPE",
        None,
        ["Debug", "Release", "RelWithDebInfo"],
    ),
    "VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
    "VLLM_CONFIG_ROOT": lambda: expanduser(
        getenv(
            "VLLM_CONFIG_ROOT",
            join(get_default_config_root(), "vllm"),
        )
    ),
    "VLLM_CACHE_ROOT": lambda: expanduser(
        getenv(
            "VLLM_CACHE_ROOT",
            join(get_default_cache_root(), "vllm"),
        )
    ),
    "VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
    "VLLM_PORT": get_vllm_port,
    "VLLM_RPC_BASE_PATH": lambda: getenv(
        "VLLM_RPC_BASE_PATH", gettempdir()
    ),
    "VLLM_USE_MODELSCOPE": lambda: lower() == "true",
    "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
        get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
    ),
    "CUDA_HOME": lambda: get("CUDA_HOME", None),
    "VLLM_NCCL_SO_PATH": lambda: get(
        "VLLM_NCCL_SO_PATH", None
    ),
    "LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
    "VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
    in ("true", "1"),
    "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
        get("VLLM_FLASH_ATTN_VERSION", None)
    ),
    "VLLM_USE_STANDALONE_COMPILE": lambda: get(
        "VLLM_USE_STANDALONE_COMPILE", "1"
    )
    == "1",
    "VLLM_PATTERN_MATCH_DEBUG": lambda: get(
        "VLLM_PATTERN_MATCH_DEBUG", None
    ),
    "VLLM_DEBUG_DUMP_PATH": lambda: get(
        "VLLM_DEBUG_DUMP_PATH", None
    ),
    "VLLM_USE_AOT_COMPILE": use_aot_compile,
    "VLLM_FORCE_AOT_LOAD": lambda: get(
        "VLLM_FORCE_AOT_LOAD", "0"
    )
    == "1",
    "LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
    "CUDA_VISIBLE_DEVICES": lambda: get(
        "CUDA_VISIBLE_DEVICES", None
    ),
    "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
        get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
    ),
    "VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
    == "true",
    "S3_ACCESS_KEY_ID": lambda: get(
        "S3_ACCESS_KEY_ID", None
    ),
    "S3_SECRET_ACCESS_KEY": lambda: get(
        "S3_SECRET_ACCESS_KEY", None
    ),
    "S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
    "VLLM_USAGE_STATS_SERVER": lambda: get(
        "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
    ),
    "VLLM_NO_USAGE_STATS": lambda: get(
        "VLLM_NO_USAGE_STATS", "0"
    )
    == "1",
    "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: get(
        "VLLM_DISABLE_FLASHINFER_PREFILL", "0"
    )
    == "1",
    "VLLM_DO_NOT_TRACK": lambda: (
        get("VLLM_DO_NOT_TRACK", None)
        or get("DO_NOT_TRACK", None)
        or "0"
    )
    == "1",
    "VLLM_USAGE_SOURCE": lambda: get(
        "VLLM_USAGE_SOURCE", "production"
    ),
    "VLLM_CONFIGURE_LOGGING": lambda: int(
        getenv("VLLM_CONFIGURE_LOGGING", "1")
    ),
    "VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
        "VLLM_LOGGING_CONFIG_PATH"
    ),
    "VLLM_LOGGING_LEVEL": lambda: upper(),
    "VLLM_LOGGING_STREAM": lambda: getenv(
        "VLLM_LOGGING_STREAM", "ext://sys.stdout"
    ),
    "VLLM_LOGGING_PREFIX": lambda: getenv(
        "VLLM_LOGGING_PREFIX", ""
    ),
    "VLLM_LOG_STATS_INTERVAL": lambda: val
    if (
        val := (
            float(getenv("VLLM_LOG_STATS_INTERVAL", "10."))
        )
    )
    > 0.0
    else 10.0,
    "VLLM_TRACE_FUNCTION": lambda: int(
        getenv("VLLM_TRACE_FUNCTION", "0")
    ),
    "VLLM_ATTENTION_BACKEND": env_with_choices(
        "VLLM_ATTENTION_BACKEND", None, lambda: list(keys())
    ),
    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
        int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
    )
    if "VLLM_USE_FLASHINFER_SAMPLER" in environ
    else None,
    "VLLM_PP_LAYER_PARTITION": lambda: getenv(
        "VLLM_PP_LAYER_PARTITION", None
    ),
    "VLLM_CPU_KVCACHE_SPACE": lambda: int(
        getenv("VLLM_CPU_KVCACHE_SPACE", "0")
    )
    if "VLLM_CPU_KVCACHE_SPACE" in environ
    else None,
    "VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
        "VLLM_CPU_OMP_THREADS_BIND", "auto"
    ),
    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
        getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
    )
    if "VLLM_CPU_NUM_OF_RESERVED_CPU" in environ
    else None,
    "VLLM_CPU_MOE_PREPACK": lambda: bool(
        int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
    ),
    "VLLM_CPU_SGL_KERNEL": lambda: bool(
        int(getenv("VLLM_CPU_SGL_KERNEL", "0"))
    ),
    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices(
        "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE",
        "auto",
        ["auto", "nccl", "shm"],
    ),
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
        int(
            getenv(
                "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
                "0",
            )
        )
    ),
    "VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
        int(getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
    ),
    "VLLM_WORKER_MULTIPROC_METHOD": env_with_choices(
        "VLLM_WORKER_MULTIPROC_METHOD",
        "fork",
        ["spawn", "fork"],
    ),
    "VLLM_ASSETS_CACHE": lambda: expanduser(
        getenv(
            "VLLM_ASSETS_CACHE",
            join(
                get_default_cache_root(), "vllm", "assets"
            ),
        )
    ),
    "VLLM_ASSETS_CACHE_MODEL_CLEAN": lambda: bool(
        int(getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0"))
    ),
    "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
    ),
    "VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
    ),
    "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
        getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
    ),
    "VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
        int(getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))
    ),
    "VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int(
        getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")
    ),
    "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int(
        getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25")
    ),
    "VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
        "VLLM_VIDEO_LOADER_BACKEND", "opencv"
    ),
    "VLLM_MM_INPUT_CACHE_GIB": lambda: int(
        getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
    ),
    "VLLM_XLA_CACHE_PATH": lambda: expanduser(
        getenv(
            "VLLM_XLA_CACHE_PATH",
            join(
                get_default_cache_root(),
                "vllm",
                "xla_cache",
            ),
        )
    ),
    "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
        int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
    ),
    "VLLM_XLA_USE_SPMD": lambda: bool(
        int(getenv("VLLM_XLA_USE_SPMD", "0"))
    ),
    "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
        getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
    ),
    "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool(
        int(
            getenv(
                "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING",
                "1",
            )
        )
    ),
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
        getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
    ),
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
    in ("1", "true"),
    "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
        "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
    ),
    "VLLM_RPC_TIMEOUT": lambda: int(
        getenv("VLLM_RPC_TIMEOUT", "10000")
    ),
    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
        get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
    ),
    "VLLM_PLUGINS": lambda: None
    if "VLLM_PLUGINS" not in environ
    else split(","),
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
    "VLLM_TORCH_PROFILER_DIR": lambda: None
    if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
    else abspath(
        expanduser(getenv("VLLM_TORCH_PROFILER_DIR", "."))
    ),
    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0")
        != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool(
        getenv(
            "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0"
        )
        != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"
    ),
    "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
        getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
    ),
    "VLLM_USE_TRITON_AWQ": lambda: bool(
        int(getenv("VLLM_USE_TRITON_AWQ", "0"))
    ),
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
    in ("1", "true"),
    "VLLM_SKIP_P2P_CHECK": lambda: getenv(
        "VLLM_SKIP_P2P_CHECK", "1"
    )
    == "1",
    "VLLM_DISABLED_KERNELS": lambda: []
    if "VLLM_DISABLED_KERNELS" not in environ
    else split(","),
    "VLLM_DISABLE_PYNCCL": lambda: lower() in ("true", "1"),
    "VLLM_USE_V1": lambda: bool(
        int(getenv("VLLM_USE_V1", "1"))
    ),
    "VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MOE": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MLA": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MHA": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_TRITON_ROPE": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_FP8BMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_FP8_PADDING": lambda: bool(
        int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
    ),
    "VLLM_ROCM_MOE_PADDING": lambda: bool(
        int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
    ),
    "VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": env_with_choices(
        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
        "NONE",
        ["FP", "INT8", "INT6", "INT4", "NONE"],
    ),
    "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: lower()
    in ("true", "1"),
    "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int(
        get(
            "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None
        )
    ),
    "Q_SCALE_CONSTANT": lambda: int(
        getenv("Q_SCALE_CONSTANT", "200")
    ),
    "K_SCALE_CONSTANT": lambda: int(
        getenv("K_SCALE_CONSTANT", "200")
    ),
    "V_SCALE_CONSTANT": lambda: int(
        getenv("V_SCALE_CONSTANT", "100")
    ),
    "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
        int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
    ),
    "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
        getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
    ),
    "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
        int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
    ),
    "VLLM_SERVER_DEV_MODE": lambda: bool(
        int(getenv("VLLM_SERVER_DEV_MODE", "0"))
    ),
    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
        getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
    ),
    "VLLM_MLA_DISABLE": lambda: bool(
        int(getenv("VLLM_MLA_DISABLE", "0"))
    ),
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int(
        getenv(
            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
            "32",
        )
    ),
    "VLLM_RAY_PER_WORKER_GPUS": lambda: float(
        getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
    ),
    "VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
        "VLLM_RAY_BUNDLE_INDICES", ""
    ),
    "VLLM_CUDART_SO_PATH": lambda: getenv(
        "VLLM_CUDART_SO_PATH", None
    ),
    "VLLM_DP_RANK": lambda: int(
        getenv("VLLM_DP_RANK", "0")
    ),
    "VLLM_DP_RANK_LOCAL": lambda: int(
        getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
    ),
    "VLLM_DP_SIZE": lambda: int(
        getenv("VLLM_DP_SIZE", "1")
    ),
    "VLLM_DP_MASTER_IP": lambda: getenv(
        "VLLM_DP_MASTER_IP", "127.0.0.1"
    ),
    "VLLM_DP_MASTER_PORT": lambda: int(
        getenv("VLLM_DP_MASTER_PORT", "0")
    ),
    "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
        getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
    ),
    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
        "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
    )
    == "1",
    "VLLM_RAY_DP_PACK_STRATEGY": lambda: getenv(
        "VLLM_RAY_DP_PACK_STRATEGY", "strict"
    ),
    "VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
    == "1",
    "VLLM_MODEL_REDIRECT_PATH": lambda: get(
        "VLLM_MODEL_REDIRECT_PATH", None
    ),
    "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
        "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
    )
    == "1",
    "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
        get("VLLM_MXFP4_USE_MARLIN", None)
    ),
    "VLLM_V1_USE_OUTLINES_CACHE": lambda: get(
        "VLLM_V1_USE_OUTLINES_CACHE", "0"
    )
    == "1",
    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
        environ["VLLM_TPU_BUCKET_PADDING_GAP"]
    )
    if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
    else 0,
    "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(
        get("VLLM_TPU_MOST_MODEL_LEN", None)
    ),
    "VLLM_TPU_USING_PATHWAYS": lambda: bool(
        "proxy" in lower()
    ),
    "VLLM_USE_DEEP_GEMM": lambda: bool(
        int(getenv("VLLM_USE_DEEP_GEMM", "1"))
    ),
    "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(
        int(getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))
    ),
    "VLLM_DEEP_GEMM_WARMUP": env_with_choices(
        "VLLM_DEEP_GEMM_WARMUP",
        "relax",
        ["skip", "full", "relax"],
    ),
    "VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool(
        int(getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(
        int(getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"
            )
        )
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
                "0",
            )
        )
    ),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
        int(
            getenv(
                "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"
            )
        )
    ),
    "VLLM_XGRAMMAR_CACHE_MB": lambda: int(
        getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
    ),
    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
        getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
    ),
    "VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
        int(
            getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
        )
    ),
    "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
        "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
    ),
    "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
        getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600")
    ),
    "VLLM_ALL2ALL_BACKEND": env_with_choices(
        "VLLM_ALL2ALL_BACKEND",
        "allgather_reducescatter",
        [
            "naive",
            "pplx",
            "deepep_high_throughput",
            "deepep_low_latency",
            "allgather_reducescatter",
            "flashinfer_all2allv",
        ],
    ),
    "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices(
        "VLLM_FLASHINFER_MOE_BACKEND",
        "throughput",
        ["throughput", "latency"],
    ),
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
        getenv(
            "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
        )
    ),
    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: loads(
        getenv(
            "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB",
            "{}",
        )
    ),
    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: lower(),
    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
        getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
    ),
    "VLLM_SLEEP_WHEN_IDLE": lambda: bool(
        int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
    ),
    "VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
        getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
    ),
    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int(
        getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")
    ),
    "VLLM_KV_CACHE_LAYOUT": env_with_choices(
        "VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"]
    ),
    "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
        int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
    ),
    "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool(
        int(getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))
    ),
    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int(
        getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")
    ),
    "VLLM_USE_CUDNN_PREFILL": lambda: bool(
        int(getenv("VLLM_USE_CUDNN_PREFILL", "0"))
    ),
    "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
        int(
            getenv(
                "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
                "0",
            )
        )
    ),
    "VLLM_USE_TRTLLM_ATTENTION": lambda: None
    if "VLLM_USE_TRTLLM_ATTENTION" not in environ
    else lower() in ("1", "true"),
    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool(
        int(
            getenv(
                "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
                "0",
            )
        )
    ),
    "VLLM_HAS_FLASHINFER_CUBIN": lambda: getenv(
        "VLLM_HAS_FLASHINFER_CUBIN", False
    ),
    "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
        "VLLM_NVFP4_GEMM_BACKEND",
        None,
        [
            "flashinfer-cudnn",
            "flashinfer-trtllm",
            "flashinfer-cutlass",
        ],
    ),
    "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
        int(getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
    ),
    "VLLM_LOOPBACK_IP": lambda: getenv(
        "VLLM_LOOPBACK_IP", ""
    ),
    "VLLM_PROCESS_NAME_PREFIX": lambda: getenv(
        "VLLM_PROCESS_NAME_PREFIX", "VLLM"
    ),
    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
        int(
            getenv(
                "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE",
                "0",
            )
        )
    ),
    "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(
        int(getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))
    ),
    "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool(
        int(getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0"))
    ),
    "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
        int(getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))
    ),
    "VLLM_TUNED_CONFIG_FOLDER": lambda: getenv(
        "VLLM_TUNED_CONFIG_FOLDER", None
    ),
    "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_set_with_choices(
        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS",
        default=[],
        choices=[
            "container",
            "code_interpreter",
            "web_search_preview",
        ],
    ),
    "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool(
        int(
            getenv(
                "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS",
                "0",
            )
        )
    ),
    "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool(
        int(
            getenv(
                "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0"
            )
        )
    ),
    "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
        int(getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
    ),
    "VLLM_NVTX_SCOPES_FOR_PROFILING": lambda: bool(
        int(getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0"))
    ),
    "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool(
        int(
            getenv(
                "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"
            )
        )
    ),
    "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: getenv(
        "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
        "VLLM_OBJECT_STORAGE_SHM_BUFFER",
    ),
    "VLLM_DEEPEP_BUFFER_SIZE_MB": lambda: int(
        getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
    ),
    "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
        int(
            getenv(
                "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE",
                "0",
            )
        )
    ),
    "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK": lambda: bool(
        int(
            getenv(
                "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "1"
            )
        )
    ),
    "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
        int(
            getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0")
        )
    ),
    "VLLM_DBO_COMM_SMS": lambda: int(
        getenv("VLLM_DBO_COMM_SMS", "20")
    ),
    "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": lambda: bool(
        int(
            getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1")
        )
    ),
    "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool(
        int(
            getenv(
                "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
                "1",
            )
        )
    ),
    "VLLM_USE_NCCL_SYMM_MEM": lambda: bool(
        int(getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))
    ),
    "VLLM_NCCL_INCLUDE_PATH": lambda: get(
        "VLLM_NCCL_INCLUDE_PATH", None
    ),
    "VLLM_USE_FBGEMM": lambda: bool(
        int(getenv("VLLM_USE_FBGEMM", "0"))
    ),
    "VLLM_GC_DEBUG": lambda: getenv("VLLM_GC_DEBUG", ""),
    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: getenv(
        "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
    ),
}
  __dir__ ¶
     __getattr__ ¶
 __getattr__(name: str)
Gets environment variables lazily.
NOTE: After enable_envs_cache() invocation (which triggered after service initialization), all environment variables will be cached.
Source code in vllm/envs.py
   compute_hash ¶
 compute_hash() -> str
WARNING: Whenever a new key is added to this environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION will generate different computation graphs, so it is included in the factors list. The env vars that affect the choice of different kernels or attention backends should also be included in the factors list.
Source code in vllm/envs.py
 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599  |  | 
  enable_envs_cache ¶
  Enables caching of environment variables. This is useful for performance reasons, as it avoids the need to re-evaluate environment variables on every call.
NOTE: Currently, it's invoked after service initialization to reduce runtime overhead. This also means that environment variables should NOT be updated after the service is initialized.
Source code in vllm/envs.py
   env_list_with_choices ¶
 env_list_with_choices(
    env_name: str,
    default: list[str],
    choices: list[str] | Callable[[], list[str]],
    case_sensitive: bool = True,
) -> Callable[[], list[str]]
Create a lambda that validates environment variable containing comma-separated values against allowed choices
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
 env_name  |   str  |    Name of the environment variable  |  required | 
 default  |   list[str]  |    Default list of values if not set  |  required | 
 choices  |   list[str] | Callable[[], list[str]]  |    List of valid string options or callable that returns list  |  required | 
 case_sensitive  |   bool  |    Whether validation should be case sensitive  |   True  |  
Returns:
| Type | Description | 
|---|---|
 Callable[[], list[str]]  |    Lambda function for environment_variables  |  
 Callable[[], list[str]]  |    dict that returns list of strings  |  
Source code in vllm/envs.py
   env_set_with_choices ¶
 env_set_with_choices(
    env_name: str,
    default: list[str],
    choices: list[str] | Callable[[], list[str]],
    case_sensitive: bool = True,
) -> Callable[[], set[str]]
Creates a lambda which that validates environment variable containing comma-separated values against allowed choices which returns choices as a set.
Source code in vllm/envs.py
   env_with_choices ¶
 env_with_choices(
    env_name: str,
    default: str | None,
    choices: list[str] | Callable[[], list[str]],
    case_sensitive: bool = True,
) -> Callable[[], str | None]
Create a lambda that validates environment variable against allowed choices
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
 env_name  |   str  |    Name of the environment variable  |  required | 
 default  |   str | None  |    Default value if not set (can be None)  |  required | 
 choices  |   list[str] | Callable[[], list[str]]  |    List of valid string options or callable that returns list  |  required | 
 case_sensitive  |   bool  |    Whether validation should be case sensitive  |   True  |  
Returns:
| Type | Description | 
|---|---|
 Callable[[], str | None]  |    Lambda function for environment_variables dict  |  
Source code in vllm/envs.py
   get_default_cache_root ¶
     get_default_config_root ¶
     get_vllm_port ¶
 get_vllm_port() -> int | None
Get the port from VLLM_PORT environment variable.
Returns:
| Type | Description | 
|---|---|
 int | None  |    The port number as an integer if VLLM_PORT is set, None otherwise.  |  
Raises:
| Type | Description | 
|---|---|
 ValueError  |    If VLLM_PORT is a URI, suggest k8s service discovery issue.  |