|
Up
|
|
|
|
|
benchmarking-cli.md
|
|
|
|
|
benchmarking-dashboard.md
|
|
|
|
|
benchmarking-sweeps.md
|
|
|
|
|
cli-bench-latency.md
|
|
|
|
|
cli-bench-serve.md
|
|
|
|
|
cli-bench-sweep-plot-pareto.md
|
|
|
|
|
cli-bench-sweep-plot.md
|
|
|
|
|
cli-bench-sweep-serve-sla.md
|
|
|
|
|
cli-bench-sweep-serve.md
|
|
|
|
|
cli-bench-throughput.md
|
|
|
|
|
cli-chat.md
|
|
|
|
|
cli-complete.md
|
|
|
|
|
cli-json-tipinc.md
|
|
|
|
|
cli-run-batch.md
|
|
|
|
|
cli-serve.md
|
|
|
|
|
community-meetups.md
|
|
|
|
|
community-sponsors.md
|
|
|
|
|
configuration-conserving-memory.md
|
|
|
|
|
configuration-engine-args.md
|
|
|
|
|
configuration-env-vars.md
|
|
|
|
|
configuration-model-resolution.md
|
|
|
|
|
configuration-optimization.md
|
|
|
|
|
configuration-serve-args.md
|
|
|
|
|
contributing-ci-failures.md
|
|
|
|
|
contributing-ci-nightly-builds.md
|
|
|
|
|
contributing-ci-update-pytorch-version.md
|
|
|
|
|
contributing-deprecation-policy.md
|
|
|
|
|
contributing-dockerfile-dockerfile.md
|
|
|
|
|
contributing-incremental-build.md
|
|
|
|
|
contributing-model-basic.md
|
|
|
|
|
contributing-model-multimodal.md
|
|
|
|
|
contributing-model-registration.md
|
|
|
|
|
contributing-model-tests.md
|
|
|
|
|
contributing-model-transcription.md
|
|
|
|
|
contributing-profiling.md
|
|
|
|
|
contributing-vulnerability-management.md
|
|
|
|
|
deployment-docker.md
|
|
|
|
|
deployment-frameworks-anyscale.md
|
|
|
|
|
deployment-frameworks-anything-llm.md
|
|
|
|
|
deployment-frameworks-autogen.md
|
|
|
|
|
deployment-frameworks-bentoml.md
|
|
|
|
|
deployment-frameworks-cerebrium.md
|
|
|
|
|
deployment-frameworks-chatbox.md
|
|
|
|
|
deployment-frameworks-dify.md
|
|
|
|
|
deployment-frameworks-dstack.md
|
|
|
|
|
deployment-frameworks-haystack.md
|
|
|
|
|
deployment-frameworks-helm.md
|
|
|
|
|
deployment-frameworks-hf-inference-endpoints.md
|
|
|
|
|
deployment-frameworks-litellm.md
|
|
|
|
|
deployment-frameworks-lobe-chat.md
|
|
|
|
|
deployment-frameworks-lws.md
|
|
|
|
|
deployment-frameworks-modal.md
|
|
|
|
|
deployment-frameworks-open-webui.md
|
|
|
|
|
deployment-frameworks-retrieval-augmented-generation.md
|
|
|
|
|
deployment-frameworks-skypilot.md
|
|
|
|
|
deployment-frameworks-streamlit.md
|
|
|
|
|
deployment-frameworks-triton.md
|
|
|
|
|
deployment-integrations-kaito.md
|
|
|
|
|
deployment-integrations-kserve.md
|
|
|
|
|
deployment-integrations-kthena.md
|
|
|
|
|
deployment-integrations-kubeai.md
|
|
|
|
|
deployment-integrations-kuberay.md
|
|
|
|
|
deployment-integrations-llamastack.md
|
|
|
|
|
deployment-integrations-llm-d.md
|
|
|
|
|
deployment-integrations-llmaz.md
|
|
|
|
|
deployment-integrations-production-stack.md
|
|
|
|
|
deployment-k8s.md
|
|
|
|
|
deployment-nginx.md
|
|
|
|
|
design-arch-overview.md
|
|
|
|
|
design-cuda-graphs.md
|
|
|
|
|
design-dbo.md
|
|
|
|
|
design-debug-vllm-compile.md
|
|
|
|
|
design-fused-moe-modular-kernel.md
|
|
|
|
|
design-huggingface-integration.md
|
|
|
|
|
design-hybrid-kv-cache-manager.md
|
|
|
|
|
design-io-processor-plugins.md
|
|
|
|
|
design-logits-processors.md
|
|
|
|
|
design-lora-resolver-plugins.md
|
|
|
|
|
design-metrics.md
|
|
|
|
|
design-mm-processing.md
|
|
|
|
|
design-moe-kernel-features.md
|
|
|
|
|
design-multiprocessing.md
|
|
|
|
|
design-optimization-levels.md
|
|
|
|
|
design-p2p-nccl-connector.md
|
|
|
|
|
design-paged-attention.md
|
|
|
|
|
design-plugin-system.md
|
|
|
|
|
design-prefix-caching.md
|
|
|
|
|
design-torch-compile.md
|
|
|
|
|
features-automatic-prefix-caching.md
|
|
|
|
|
features-batch-invariance.md
|
|
|
|
|
features-custom-arguments.md
|
|
|
|
|
features-custom-logitsprocs.md
|
|
|
|
|
features-disagg-encoder.md
|
|
|
|
|
features-disagg-prefill.md
|
|
|
|
|
features-interleaved-thinking.md
|
|
|
|
|
features-lora.md
|
|
|
|
|
features-mooncake-connector-usage.md
|
|
|
|
|
features-multimodal-inputs.md
|
|
|
|
|
features-nixl-connector-usage.md
|
|
|
|
|
features-prompt-embeds.md
|
|
|
|
|
features-quantization-auto-awq.md
|
|
|
|
|
features-quantization-auto-round.md
|
|
|
|
|
features-quantization-bitblas.md
|
|
|
|
|
features-quantization-bnb.md
|
|
|
|
|
features-quantization-fp8.md
|
|
|
|
|
features-quantization-gguf.md
|
|
|
|
|
features-quantization-gptqmodel.md
|
|
|
|
|
features-quantization-inc.md
|
|
|
|
|
features-quantization-int4.md
|
|
|
|
|
features-quantization-int8.md
|
|
|
|
|
features-quantization-modelopt.md
|
|
|
|
|
features-quantization-quantized-kvcache.md
|
|
|
|
|
features-quantization-quark.md
|
|
|
|
|
features-quantization-torchao.md
|
|
|
|
|
features-reasoning-outputs.md
|
|
|
|
|
features-sleep-mode.md
|
|
|
|
|
features-spec-decode.md
|
|
|
|
|
features-structured-outputs.md
|
|
|
|
|
features-tool-calling.md
|
|
|
|
|
getting-started-installation-cpu.md
|
|
|
|
|
getting-started-installation-cpuappleinc.md
|
|
|
|
|
getting-started-installation-cpuarminc.md
|
|
|
|
|
getting-started-installation-cpus390xinc.md
|
|
|
|
|
getting-started-installation-cpux86inc.md
|
|
|
|
|
getting-started-installation-devicetemplate.md
|
|
|
|
|
getting-started-installation-gpu.md
|
|
|
|
|
getting-started-installation-gpucudainc.md
|
|
|
|
|
getting-started-installation-gpurocminc.md
|
|
|
|
|
getting-started-installation-gpuxpuinc.md
|
|
|
|
|
getting-started-installation-python-env-setupinc.md
|
|
|
|
|
getting-started-quickstart.md
|
|
|
|
|
governance-collaboration.md
|
|
|
|
|
governance-committers.md
|
|
|
|
|
governance-process.md
|
|
|
|
|
llms-full.txt
|
|
|
|
|
llms.txt
|
|
|
|
|
models-extensions-fastsafetensor.md
|
|
|
|
|
models-extensions-runai-model-streamer.md
|
|
|
|
|
models-extensions-tensorizer.md
|
|
|
|
|
models-generative-models.md
|
|
|
|
|
models-hardware-supported-models-cpu.md
|
|
|
|
|
models-hardware-supported-models-xpu.md
|
|
|
|
|
models-pooling-models.md
|
|
|
|
|
models-supported-models.md
|
|
|
|
|
serving-context-parallel-deployment.md
|
|
|
|
|
serving-data-parallel-deployment.md
|
|
|
|
|
serving-distributed-troubleshooting.md
|
|
|
|
|
serving-expert-parallel-deployment.md
|
|
|
|
|
serving-integrations-langchain.md
|
|
|
|
|
serving-integrations-llamaindex.md
|
|
|
|
|
serving-offline-inference.md
|
|
|
|
|
serving-openai-compatible-server.md
|
|
|
|
|
serving-parallelism-scaling.md
|
|
|
|
|
training-rlhf.md
|
|
|
|
|
training-trl.md
|
|
|
|
|
usage-faq.md
|
|
|
|
|
usage-metrics.md
|
|
|
|
|
usage-reproducibility.md
|
|
|
|
|
usage-security.md
|
|
|
|
|
usage-troubleshooting.md
|
|
|
|
|
usage-usage-stats.md
|
|
|
|
|
usage-v1-guide.md
|
|
|
|