GLM-4.5-FP8
9.2K
75
3 languages
license:mit
by
zai-org
Language Model
OTHER
New
9K downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary
📍 Use GLM-4.5 API services on Z.ai API Platform (Global) or Zhipu AI Open Platform (Mainland China). We present GLM-4.5, an open-source Mixture-of-Experts (M...
Code Examples
Transformers Inference (python, transformers)
# Transformers inference example for zai-org/GLM-4.5-FP8.
# NOTE(review): the original page repeated this identical snippet 18 times
# (a scraping artifact); it is consolidated into a single example here.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # bfloat16 is a safe default. For FP8 checkpoints the relevant dtypes are
    # torch.float8_e4m3fn / torch.float8_e5m2 (there is no `torch.float8`),
    # and FP8 loading generally needs backend support — adjust as needed.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

messages = [
    {"role": "user", "content": "Hello, how are you?"},
]


def _chat(add_nothink_token, max_new_tokens=100):
    """Render the chat template, generate, and return only the new text.

    `add_nothink_token` is not a standard `apply_chat_template` argument; it
    is forwarded as a kwarg to the model's own chat template — TODO confirm
    the GLM-4.5 template actually defines it. True triggers non-thinking
    mode (direct response); False (or omitting it) triggers thinking mode.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        add_nothink_token=add_nothink_token,
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)
    # Slice off the prompt tokens so only the generated continuation is decoded.
    return tokenizer.decode(
        output_ids[0][len(input_ids[0]):], skip_special_tokens=True
    )


# Non-thinking mode: suitable for straightforward questions that do not
# require complex reasoning or tool usage.
print("Non-thinking mode response:", _chat(add_nothink_token=True))

# Thinking mode: allows the model to perform multi-step reasoning, break
# down tasks, and utilize tools.
print("Thinking mode response:", _chat(add_nothink_token=False))
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
"""Transformers inference example for zai-org/GLM-4.5-FP8.

Demonstrates the two chat-template modes exposed by the model's tokenizer:

* non-thinking mode (``add_nothink_token=True``)  -- direct answers for
  straightforward questions not requiring complex reasoning or tool usage.
* thinking mode     (``add_nothink_token=False``, the default) -- lets the
  model perform multi-step reasoning, break down tasks, and utilize tools.
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer.
# trust_remote_code=True is required: the repository ships custom
# tokenizer/model code that is not part of the transformers release.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Adjust as needed. NOTE(review): PyTorch has no ``torch.float8``; the
    # FP8 dtypes are ``torch.float8_e4m3fn`` / ``torch.float8_e5m2`` — confirm
    # against the model card before changing this.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

messages = [
    {"role": "user", "content": "Hello, how are you?"},
]


def _generate_reply(add_nothink_token: bool) -> str:
    """Render the chat template, generate, and return only the new text.

    Parameters
    ----------
    add_nothink_token:
        ``True`` selects non-thinking mode (direct response); ``False``
        (or omitting the flag) selects thinking mode.
    """
    prompt_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        add_nothink_token=add_nothink_token,
    )
    input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=100)
    # Slice off the prompt tokens so only the generated reply is decoded.
    return tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)


# Example for non-thinking mode (direct response).
print("Non-thinking mode response:", _generate_reply(add_nothink_token=True))

# Example for thinking mode (for complex reasoning or tool usage).
print("Thinking mode response:", _generate_reply(add_nothink_token=False))
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
# NOTE(review): verbatim duplicate of the inference example shown earlier on this page (scrape artifact).
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer; trust_remote_code executes the checkpoint's bundled custom code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # NOTE(review): PyTorch has no ``torch.float8`` dtype; FP8 checkpoints load via their quantization config
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# ``add_nothink_token=True`` is a custom chat-template kwarg for this model -- confirm against the shipped template.
# Suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# Omitting ``add_nothink_token`` (or passing False) keeps the default thinking mode.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

### Code example: Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load tokenizer and model; trust_remote_code is required because the
# GLM-4.5 checkpoint ships its own modeling/tokenizer code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # compute dtype; NOTE: `torch.float8` does not exist — FP8 weights load via the checkpoint's quantization config (FP8 element types are torch.float8_e4m3fn / e5m2)
    low_cpu_mem_usage=True,
    device_map="auto",  # shard across available devices via accelerate
    trust_remote_code=True
)
model.eval()  # inference mode: disables dropout and similar training-only layers
messages = [
    {"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response).
# `add_nothink_token=True` triggers non-thinking mode — suitable for
# straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
# The slice strips the prompt tokens so only the generated continuation is decoded.
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage).
# By default (`add_nothink_token=False` or omitted) the template selects
# thinking mode, which lets the model reason in multiple steps and use tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
# GLM-4.5-FP8 — Transformers inference example (thinking vs. non-thinking mode).
#
# NOTE(review): the page scraper duplicated this exact example ~20 times;
# the copies are collapsed here into a single script. No behavior other
# than the redundant repetition is removed.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer. `trust_remote_code=True` is required because the
# repository ships custom tokenizer/model code.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # NOTE(review): the original comment suggested `torch.float8`, which does
    # not exist — PyTorch's FP8 dtypes are torch.float8_e4m3fn /
    # torch.float8_e5m2. bfloat16 is a safe compute dtype default here.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

messages = [
    {"role": "user", "content": "Hello, how are you?"},
]


def _generate(add_nothink_token, max_new_tokens=100):
    """Render the chat template, run generation, and return only the reply text.

    Args:
        add_nothink_token: True selects non-thinking mode (direct responses,
            suitable for straightforward questions). False — the template
            default — selects thinking mode, which lets the model perform
            multi-step reasoning, break down tasks, and use tools.
        max_new_tokens: generation budget passed to `model.generate`.

    Returns:
        The decoded completion with the prompt tokens and special tokens
        stripped.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        add_nothink_token=add_nothink_token,
    )
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)
    # Slice off the prompt so only newly generated tokens are decoded.
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)


# Non-thinking mode: `add_nothink_token=True` triggers direct responses.
print("Non-thinking mode response:", _generate(add_nothink_token=True))

# Thinking mode: `add_nothink_token=False` (or omitting it) enables
# multi-step reasoning and tool usage.
print("Thinking mode response:", _generate(add_nothink_token=False))
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and the FP8 GLM-4.5 checkpoint.
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # Adjust as needed (e.g., torch.float8 for FP8 models)
    low_cpu_mem_usage=True,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

messages = [{"role": "user", "content": "Hello, how are you?"}]

# --- Non-thinking mode: `add_nothink_token=True` requests a direct answer,
# suitable for straightforward questions without complex reasoning. ---
inputs_nothink_text = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True
)
encoded_nothink = tokenizer(inputs_nothink_text, return_tensors="pt")
input_ids_nothink = encoded_nothink.input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
prompt_len = len(input_ids_nothink[0])
reply_nothink = tokenizer.decode(outputs_nothink[0][prompt_len:], skip_special_tokens=True)
print("Non-thinking mode response:", reply_nothink)

# --- Thinking mode: the default (`add_nothink_token=False` or omitted),
# for multi-step reasoning and tool usage. ---
inputs_think_text = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False
)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))

Transformers Inference (python, transformers)
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Transformers Inferencepythontransformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load model and tokenizer
model_id = "zai-org/GLM-4.5-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16, # Adjust as needed (e.g., torch.float8 for FP8 models)
low_cpu_mem_usage=True,
device_map="auto",
trust_remote_code=True
)
model.eval()
messages = [
{"role": "user", "content": "Hello, how are you?"},
]
# Example for non-thinking mode (direct response)
# The `add_nothink_token=True` parameter triggers non-thinking mode.
# This mode is suitable for straightforward questions not requiring complex reasoning or tool usage.
inputs_nothink_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=True)
input_ids_nothink = tokenizer(inputs_nothink_text, return_tensors="pt").input_ids.to(model.device)
outputs_nothink = model.generate(input_ids_nothink, max_new_tokens=100)
print("Non-thinking mode response:", tokenizer.decode(outputs_nothink[0][len(input_ids_nothink[0]):], skip_special_tokens=True))
# Example for thinking mode (for complex reasoning or tool usage)
# By default, `add_nothink_token=False` or omitting it triggers thinking mode.
# This mode allows the model to perform multi-step reasoning, break down tasks, and utilize tools.
inputs_think_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, add_nothink_token=False)
input_ids_think = tokenizer(inputs_think_text, return_tensors="pt").input_ids.to(model.device)
outputs_think = model.generate(input_ids_think, max_new_tokens=100)
print("Thinking mode response:", tokenizer.decode(outputs_think[0][len(input_ids_think[0]):], skip_special_tokens=True))Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}Citationbibtex
@article{zhu2025glm45,
title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
journal={arXiv preprint arXiv:2508.06471},
year={2025}
}

## Citation (BibTeX)

@article{zhu2025glm45,
  title={GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
  author={Zhu, Xiaohan and Sun, Tianxiang and Wang, Hao and Xu, Yi and Zhang, Yichen and Wang, Junyi and Huang, Junjie and Zeng, Jiao and Huang, Yangyang and Gu, Ruipeng and Zhang, Xiaodong and Du, Mengying and Han, Hao and Li, Chao and Xiao, Jin and Guo, Weidong and Li, Zhen and Lu, Jingkang and Chen, Shu and Chen, Huadong and Chen, Peng and Liu, Hongguang and Guo, Guang and Liu, Wen and Yang, Tianyu and Hu, Bo and Zhang, Wenmin and Sun, Maosong},
  journal={arXiv preprint arXiv:2508.06471},
  year={2025}
}

Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.