Nemotron-nano-9b-fp8

Language Model by weathermanj
License: OTHER · 9B params · 1 language · 438 downloads
Status: New, early-stage
Edge AI: Mobile · Laptop · Server (21GB+ RAM)
Quick Summary

FP8-quantized build of nvidia/NVIDIA-Nemotron-Nano-9B-v2, a 9B-parameter hybrid Mamba-Transformer language model. Storing weights in 8-bit floating point roughly halves the memory footprint of the BF16 original while leaving the architecture and tokenizer unchanged.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 9GB+ RAM
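
The minimum figure follows from parameter count times bytes per parameter. A quick sketch of that arithmetic (assuming a nominal 9B parameters and counting weights only; KV cache and runtime overhead come on top):

# Back-of-the-envelope weight-memory estimate for a 9B-parameter model
# at different precisions (weights only).
PARAMS = 9_000_000_000

BYTES_PER_PARAM = {
    "fp32": 4.0,
    "bf16": 2.0,
    "fp8": 1.0,   # this model's format
}

for fmt, nbytes in BYTES_PER_PARAM.items():
    gib = PARAMS * nbytes / 1024**3
    print(f"{fmt}: ~{gib:.1f} GiB of weights")

# fp8 comes out to ~8.4 GiB, consistent with the 9GB+ minimum above.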

Code Examples

Usage (Python, vLLM)

from vllm import LLM, SamplingParams

# Load the FP8 quantized model
model = LLM(
    model="weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True,
    dtype="auto"  # auto-detects the FP8 checkpoint format
)

# Batch generation; for token-by-token streaming, serve the model
# behind vLLM's OpenAI-compatible API instead (see below)
prompts = ["Explain the benefits of hybrid Mamba-Transformer architectures."]
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=256
)

outputs = model.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
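
For streaming, the usual route is vLLM's OpenAI-compatible server. A minimal sketch, assuming a local server started with "vllm serve weathermanj/nemotron-nano-9b-fp8 --trust-remote-code" (the port and the "EMPTY" API key are local-server defaults, not model-specific settings):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# stream=True yields chunks as tokens are generated
stream = client.chat.completions.create(
    model="weathermanj/nemotron-nano-9b-fp8",
    messages=[{"role": "user", "content": "Explain FP8 quantization in two sentences."}],
    max_tokens=128,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()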
Alternative: Transformers (Python)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    "weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(
    "weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True
)

# Generate text; move inputs to the model's device, since
# device_map="auto" may have placed the model on a GPU
inputs = tokenizer("How does FP8 quantization improve AI efficiency?", return_tensors="pt").to(model.device)
with torch.no_grad():
    # max_new_tokens bounds the completion itself; max_length would
    # also count the prompt tokens
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
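
Transformers can also stream tokens to stdout as they are produced. A minimal sketch using TextStreamer, reusing the model and tokenizer loaded above:

from transformers import TextStreamer

# Prints each token as it is generated instead of waiting for the full completion
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Summarize the Mamba architecture.", return_tensors="pt").to(model.device)
with torch.no_grad():
    model.generate(**inputs, max_new_tokens=200, streamer=streamer)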
Citation (BibTeX)
@software{nemotron_fp8_quantized,
  title={NVIDIA-Nemotron-Nano-9B-v2-FP8: Efficient FP8 Quantization},
  author={jwjohns},
  organization={Emendat.io},
  year={2025},
  url={https://huggingface.co/weathermanj/nvidia-nemotron-nano-9b-v2-fp8},
  note={FP8 quantized version of NVIDIA Nemotron-Nano-9B-v2}
}

@article{nvidia2024nemotron,
  title={Nemotron-4 Technical Report},
  author={NVIDIA},
  year={2024},
  url={https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}
Model Lineage
nvidia/NVIDIA-Nemotron-Nano-9B-v2 (Base Model)
    ↓ (FP8 Quantization by jwjohns)
weathermanj/nvidia-nemotron-nano-9b-v2-fp8 (This Model)
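
The card does not publish the exact quantization recipe. As a rough illustration of how FP8 checkpoints like this one are commonly produced, here is a sketch using the llm-compressor library; the FP8_DYNAMIC scheme and the lm_head exclusion are typical defaults, not the author's confirmed settings:

# Hypothetical reproduction sketch -- the actual recipe for this checkpoint
# is not published. Requires: pip install llmcompressor
from transformers import AutoModelForCausalLM
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model = AutoModelForCausalLM.from_pretrained(
    "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
    torch_dtype="auto",
    trust_remote_code=True,
)

recipe = QuantizationModifier(
    targets="Linear",       # quantize the linear layers...
    scheme="FP8_DYNAMIC",   # ...to FP8 weights with dynamic activation scales
    ignore=["lm_head"],     # output head typically kept at higher precision
)

oneshot(model=model, recipe=recipe)  # FP8_DYNAMIC needs no calibration data
model.save_pretrained("nemotron-nano-9b-fp8", save_compressed=True)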
Usage Tracking (BibTeX)
@software{nvidia_nemotron_fp8,
  title={NVIDIA-Nemotron-Nano-9B-v2-FP8},
  author={jwjohns},
  organization={Emendat.io},
  year={2025},
  url={https://huggingface.co/weathermanj/Nemotron-nano-9b-fp8},
  note={FP8 quantized version of NVIDIA Nemotron-Nano-9B-v2},
  baseModel={nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}

@article{nvidia2024nemotron,
  title={Nemotron-4 Technical Report},
  author={NVIDIA},
  year={2024},
  url={https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}
