Nemotron-nano-9b-fp8

Language Model by weathermanj
License: OTHER · 9B params · 1 language · 438 downloads
Status: New, early-stage
Edge AI: Mobile · Laptop · Server (21GB+ RAM)
Quick Summary

FP8-quantized build of nvidia/NVIDIA-Nemotron-Nano-9B-v2, a 9B-parameter hybrid Mamba-Transformer language model. Storing weights in 8-bit floating point roughly halves the memory footprint of the BF16 original while leaving the architecture and tokenizer unchanged.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 9GB+ RAM
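
The minimum figure follows from parameter count times bytes per parameter. A quick sketch of that arithmetic (assuming a nominal 9B parameters and counting weights only; KV cache and runtime overhead come on top):

# Back-of-the-envelope weight-memory estimate for a 9B-parameter model
# at different precisions (weights only).
PARAMS = 9_000_000_000

BYTES_PER_PARAM = {
    "fp32": 4.0,
    "bf16": 2.0,
    "fp8": 1.0,   # this model's format
}

for fmt, nbytes in BYTES_PER_PARAM.items():
    gib = PARAMS * nbytes / 1024**3
    print(f"{fmt}: ~{gib:.1f} GiB of weights")

# fp8 comes out to ~8.4 GiB, consistent with the 9GB+ minimum above.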

Code Examples

Usage (Python, vLLM)

from vllm import LLM, SamplingParams

# Load the FP8 quantized model
model = LLM(
    model="weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True,
    dtype="auto"  # auto-detects the FP8 checkpoint format
)

# Batch generation; for token-by-token streaming, serve the model
# behind vLLM's OpenAI-compatible API instead (see below)
prompts = ["Explain the benefits of hybrid Mamba-Transformer architectures."]
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=256
)

outputs = model.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
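
For streaming, the usual route is vLLM's OpenAI-compatible server. A minimal sketch, assuming a local server started with "vllm serve weathermanj/nemotron-nano-9b-fp8 --trust-remote-code" (the port and the "EMPTY" API key are local-server defaults, not model-specific settings):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# stream=True yields chunks as tokens are generated
stream = client.chat.completions.create(
    model="weathermanj/nemotron-nano-9b-fp8",
    messages=[{"role": "user", "content": "Explain FP8 quantization in two sentences."}],
    max_tokens=128,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()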
Alternative: Transformers (Python)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    "weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(
    "weathermanj/nemotron-nano-9b-fp8",
    trust_remote_code=True
)

# Generate text; move inputs to the model's device, since
# device_map="auto" may have placed the model on a GPU
inputs = tokenizer("How does FP8 quantization improve AI efficiency?", return_tensors="pt").to(model.device)
with torch.no_grad():
    # max_new_tokens bounds the completion itself; max_length would
    # also count the prompt tokens
    outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
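
Transformers can also stream tokens to stdout as they are produced. A minimal sketch using TextStreamer, reusing the model and tokenizer loaded above:

from transformers import TextStreamer

# Prints each token as it is generated instead of waiting for the full completion
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Summarize the Mamba architecture.", return_tensors="pt").to(model.device)
with torch.no_grad():
    model.generate(**inputs, max_new_tokens=200, streamer=streamer)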
Citation (BibTeX)
@software{nemotron_fp8_quantized,
  title={NVIDIA-Nemotron-Nano-9B-v2-FP8: Efficient FP8 Quantization},
  author={jwjohns},
  organization={Emendat.io},
  year={2025},
  url={https://huggingface.co/weathermanj/nvidia-nemotron-nano-9b-v2-fp8},
  note={FP8 quantized version of NVIDIA Nemotron-Nano-9B-v2}
}

@article{nvidia2024nemotron,
  title={Nemotron-4 Technical Report},
  author={NVIDIA},
  year={2024},
  url={https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}
Model Lineage
nvidia/NVIDIA-Nemotron-Nano-9B-v2 (Base Model)
    ↓ (FP8 Quantization by jwjohns)
weathermanj/nvidia-nemotron-nano-9b-v2-fp8 (This Model)
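
The card does not publish the exact quantization recipe. As a rough illustration of how FP8 checkpoints like this one are commonly produced, here is a sketch using the llm-compressor library; the FP8_DYNAMIC scheme and the lm_head exclusion are typical defaults, not the author's confirmed settings:

# Hypothetical reproduction sketch -- the actual recipe for this checkpoint
# is not published. Requires: pip install llmcompressor
from transformers import AutoModelForCausalLM
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model = AutoModelForCausalLM.from_pretrained(
    "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
    torch_dtype="auto",
    trust_remote_code=True,
)

recipe = QuantizationModifier(
    targets="Linear",       # quantize the linear layers...
    scheme="FP8_DYNAMIC",   # ...to FP8 weights with dynamic activation scales
    ignore=["lm_head"],     # output head typically kept at higher precision
)

oneshot(model=model, recipe=recipe)  # FP8_DYNAMIC needs no calibration data
model.save_pretrained("nemotron-nano-9b-fp8", save_compressed=True)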
Usage Tracking (BibTeX)
@software{nvidia_nemotron_fp8,
  title={NVIDIA-Nemotron-Nano-9B-v2-FP8},
  author={jwjohns},
  organization={Emendat.io},
  year={2025},
  url={https://huggingface.co/weathermanj/Nemotron-nano-9b-fp8},
  note={FP8 quantized version of NVIDIA Nemotron-Nano-9B-v2},
  baseModel={nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}

@article{nvidia2024nemotron,
  title={Nemotron-4 Technical Report},
  author={NVIDIA},
  year={2024},
  url={https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2}
}
