Qwen2.5-1.5B-FP8-dynamic
by RedHatAI · license: apache-2.0
Language Model · 1.5B params
New · 4 downloads · Early-stage
Edge AI: Mobile · Laptop · Server · 4GB+ RAM
Quick Summary
Model Overview
- Model Architecture: Qwen2
- Input: Text
- Output: Text
- Model Optimizations:
  - Activation quantization: FP8 (dynamic)
  - Weight quantization: FP8
- Int...
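The "dynamic" in the model name refers to activation scales computed on the fly at inference time rather than calibrated offline. A minimal sketch of the idea, assuming per-tensor FP8 E4M3 quantization (448 is the E4M3 maximum magnitude); this is illustrative code, not the library code used to produce this checkpoint:

import torch

FP8_E4M3_MAX = 448.0  # largest representable magnitude in FP8 E4M3

def dynamic_fp8_quantize(x: torch.Tensor):
    """Quantize activations with a scale derived from the tensor itself.

    Illustrative only: real kernels (e.g. in vLLM) fuse this into the matmul
    and typically use per-token scales rather than one scale per tensor.
    """
    scale = x.abs().max().clamp(min=1e-12) / FP8_E4M3_MAX
    x_fp8 = (x / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)
    return x_fp8, scale

x = torch.randn(4, 1536)                  # a fake activation batch
x_q, scale = dynamic_fp8_quantize(x)
x_deq = x_q.to(torch.float32) * scale     # dequantize to inspect the error
print("max abs error:", (x - x_deq).abs().max().item())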
Device Compatibility
- Mobile: 4-6GB RAM
- Laptop: 16GB RAM
- Server: GPU
- Minimum recommended: 2GB+ RAM
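These figures are roughly consistent with the model size: at one byte per parameter, the FP8 weights of a 1.5B-parameter model take about 1.5 GB, and KV cache plus runtime overhead push practical usage toward the 2-4 GB range. A back-of-the-envelope sketch, assuming the published Qwen2.5-1.5B architecture (28 layers, 2 KV heads, head dim 128) and an illustrative 4096-token context; these assumptions are not taken from this page:

# Rough memory estimate for Qwen2.5-1.5B-FP8-dynamic (all figures approximate).
params = 1.5e9
weight_bytes = params * 1            # FP8 = 1 byte per parameter -> ~1.5 GB

layers, kv_heads, head_dim = 28, 2, 128   # assumed Qwen2.5-1.5B config
context_len, batch = 4096, 1              # illustrative workload
kv_dtype_bytes = 2                        # FP16 KV cache
kv_bytes = 2 * layers * kv_heads * head_dim * context_len * batch * kv_dtype_bytes

overhead_bytes = 0.5e9               # tokenizer, activations, runtime buffers (rough guess)

total_gb = (weight_bytes + kv_bytes + overhead_bytes) / 1e9
print(f"weights ~{weight_bytes/1e9:.1f} GB, KV cache ~{kv_bytes/1e9:.2f} GB, total ~{total_gb:.1f} GB")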
Code Examples
Deployment (Python, vLLM)
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic/Qwen2.5-1.5B-FP8-dynamic"
number_gpus = 1          # tensor-parallel degree; one GPU is enough for a 1.5B FP8 model
max_model_len = 8192     # maximum context length vLLM will allocate KV cache for

sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# The tokenizer is not strictly required for the plain-text generation below,
# but it is useful if you want to count tokens or apply a chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Give me a short introduction to large language model."

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)
outputs = llm.generate(prompt, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)
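For serving rather than offline generation, vLLM can also expose this model behind an OpenAI-compatible HTTP API. The sketch below assumes a local server started with the vllm serve command shown in the comment (the default host, port, and flags are assumptions, not part of the original example) and uses the standard openai Python client:

# Start the server in a separate shell (assumed defaults):
#   vllm serve neuralmagic/Qwen2.5-1.5B-FP8-dynamic --max-model-len 8192
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # vLLM ignores the key

response = client.completions.create(
    model="neuralmagic/Qwen2.5-1.5B-FP8-dynamic",
    prompt="Give me a short introduction to large language model.",
    max_tokens=256,
    temperature=0.7,
    top_p=0.8,
)
print(response.choices[0].text)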
Evaluation (lm-evaluation-harness, vLLM backend)
The following command evaluates the model on the Open LLM Leaderboard task group (openllm) using the vLLM backend of lm-evaluation-harness:
lm_eval \
--model vllm \
--model_args pretrained="neuralmagic/Qwen2.5-1.5B-FP8-dynamic",dtype=auto,gpu_memory_utilization=0.9,add_bos_token=True,max_model_len=4096,enable_chunk_prefill=True,tensor_parallel_size=1 \
--tasks openllm \
--batch_size auto
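The same evaluation can be driven from Python via lm-evaluation-harness's simple_evaluate entry point. The sketch below is an assumption-level translation of the CLI command above, not code taken from this page:

import lm_eval

# Mirrors the CLI invocation: vLLM backend, Open LLM Leaderboard tasks, auto batch size.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=neuralmagic/Qwen2.5-1.5B-FP8-dynamic,"
        "dtype=auto,gpu_memory_utilization=0.9,add_bos_token=True,"
        "max_model_len=4096,tensor_parallel_size=1"
    ),
    tasks=["openllm"],
    batch_size="auto",
)
print(results["results"])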