instruction-data-guard
61
17
—
by
nvidia
Other
OTHER
New
61 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary
Description: Instruction Data Guard is a deep-learning classification model that helps identify LLM poisoning attacks in datasets.
Code Examples
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Load the AEGIS adapter on top of the base model
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it
# is given the same access token as the model download above.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score per input row.

    The input is normalized along its last dimension, passed through three
    ReLU-activated linear layers (input_dim -> input_dim -> 2000 -> 500) and
    a final 500 -> 1 linear layer followed by a sigmoid. The same dropout
    module is applied before every linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dropout: Probability used by the shared dropout module.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # Attribute names and their order are kept unchanged because they
        # determine the keys of the module's state dict.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for tensor ``x``."""
        x = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            x = F.relu(layer(self.dropout(x)))
        logits = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier, move it to the selected device and
# switch it to evaluation mode
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states enabled, and the last-position
    vector from entry 32 of the first generation step's hidden states is fed
    to ``instruction_data_guard``.

    Args:
        prompts: Text(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object whose ``generate`` result exposes ``hidden_states``.
        instruction_data_guard: Callable applied to the float embedding batch.
        device: Device the tokenized batch is moved to. A CUDA device is
            replaced by the CPU when CUDA is not available.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    # The "cuda" default must not crash on hosts without CUDA, where the
    # surrounding script places the models on the CPU.
    target_device = torch.device(device)
    if target_device.type == "cuda" and not torch.cuda.is_available():
        target_device = torch.device("cpu")

    # Inference only: no autograd graph is needed for the classifier pass.
    with torch.no_grad():
        batch = tokenizer(prompts, padding=True, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **batch,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            # NOTE(review): presumably the id of the token used for padding - confirm
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: entry 32 of that
        # step's tuple; [:, -1, :]: last sequence position of every row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device the models were loaded on explicitly rather than relying on
# the helper's "cuda" default.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234" # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
padding_side="left"
)
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that outputs a sigmoid score.

    The input is L2-normalized along its last dimension and passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after the first
    three, and a sigmoid after the last.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability given to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names are kept unchanged: they define the state-dict keys.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return sigmoid scores for ``x``; the last dimension becomes 1."""
        x = F.normalize(x, dim=-1)
        # Same order as the unrolled original: dropout -> linear -> ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        x = self.hidden_layer_2(self.dropout(x))
        return self.sigmoid(x)
# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for a
    single new token with hidden states returned, and the hidden state at
    the last sequence position is passed to ``instruction_data_guard``.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking.
        model: Object exposing ``generate``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the classifier's first parameter, so a
            CPU-only setup works without passing it; if the classifier has
            no parameters the former default ``"cuda"`` is used.

    Returns:
        A flattened NumPy array of classifier outputs.
    """
    if device is None:
        get_params = getattr(instruction_data_guard, "parameters", None)
        first_param = next(get_params(), None) if callable(get_params) else None
        device = first_param.device if first_param is not None else "cuda"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: avoid building an autograd graph in the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # Index 32 is presumably the final decoder layer at the first
        # generation step -- TODO confirm against the base model's depth.
        # [:, -1, :] keeps only the last sequence position of each sample.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)
# Initialize tokenizer
# The access token is forwarded here as well so the tokenizer files are
# fetched with the same credentials as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected classifier head producing a sigmoid score per input vector.

    The input is L2-normalized along its last dimension, passed through three
    ReLU-activated linear layers (``input_dim`` -> ``input_dim`` -> 2000 -> 500)
    and a final 500 -> 1 projection followed by a sigmoid. Dropout is applied
    before every linear layer (inactive in ``eval()`` mode).

    Args:
        input_dim: Size of the last dimension of the input features.
        dropout: Dropout probability shared by all dropout applications.
    """

    def __init__(self, input_dim: int = 4096, dropout: float = 0.7) -> None:
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        # Attribute names double as parameter names in the state dict, so they
        # are kept unchanged to stay compatible with published weights.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid scores with the last dimension reduced to size 1."""
        x = F.normalize(x, dim=-1)
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))
# Load the Instruction Data Guard classifier, place it on the selected device
# and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the classifier applied to the LLM's hidden state.

    Args:
        prompts: Text sample(s) handed to ``tokenizer``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            ``**`` unpacking into ``model.generate``.
        model: Object whose ``generate`` returns per-step ``hidden_states``.
        instruction_data_guard: Classifier called on the extracted features.
        device: Device the tokenized batch is moved to; it should match the
            device ``model`` and ``instruction_data_guard`` live on.

    Returns:
        The flattened classifier output as a NumPy array.
    """
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0]  -> hidden states of the first (only) generation step
        # [32] -> entry 32 of the per-layer tuple (presumably the final decoder
        #         layer of the 7B backbone -- TODO confirm)
        # [:, -1, :] -> last sequence position (assumes left padding)
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()
# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the device selected above explicitly: the function defaults to "cuda",
# which fails on hosts where the models were placed on the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
# Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS: load the base LlamaGuard model in
# bfloat16, move it to the selected device, then load the Aegis PEFT
# checkpoint on top of it.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=dtype,
    token=token,
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer. It is fetched from the same repository as the base
# model, so it is given the same access token. Padding goes on the left so
# that the last position of every row is a real token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier that maps a feature vector to a score in (0, 1).

    The input is normalized along its last dimension, then passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after each of the
    first three, and a sigmoid after the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the expected input.
            dropout: Drop probability of the single shared dropout module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules shared across the whole forward pass.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack, created in the same order as the attribute names.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid score for ``x``.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        hidden = F.normalize(x, dim=-1)

        # Each of the first three layers: dropout -> linear -> ReLU.
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            hidden = F.relu(layer(self.dropout(hidden)))

        # Output layer: dropout -> linear -> sigmoid (no ReLU).
        logits = self.hidden_layer_2(self.dropout(hidden))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier: fetch the pretrained weights, move
# the module to the selected device and switch it to evaluation mode (which
# disables its dropout layers).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score ``prompts`` with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states returned, and the hidden state
    at index 32 of the first generation step, taken at the last sequence
    position, is cast to float32 and fed to ``instruction_data_guard``.

    Args:
        prompts: Text(s) to score; passed unchanged to ``tokenizer``.
        tokenizer: Callable tokenizer, invoked with ``padding=True`` and
            ``return_tensors="pt"``.
            NOTE(review): reading position ``-1`` presumably relies on the
            tokenizer padding on the left -- confirm its configuration.
        model: Generative model exposing ``generate`` and ``parameters``.
        instruction_data_guard: Callable mapping the feature tensor to scores.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the model's first parameter, so the function
            also works when the model is not on CUDA.

    Returns:
        Flattened NumPy array of the classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming CUDA: the inputs must live on
        # the same device as the weights that consume them.
        device = next(model.parameters()).device

    # The encoding carries every tensor the tokenizer returns, not just ids.
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

    # Pure inference: no autograd graph is needed for generation or scoring.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0] -> first generation step, [32] -> hidden-state index 32,
        # [:, -1, :] -> last sequence position of every batch row.
        # NOTE(review): index 32 presumably is the final layer of the
        # underlying model -- confirm if the base model changes.
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)

    return scores.flatten().detach().cpu().numpy()
# Prepare sample input: the classifier is fed a single string that combines
# the instruction, the (possibly empty) input and the response.
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]

# Pass the device selected during model initialization explicitly; otherwise
# the function's own default is used even when the script fell back to CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS: load the base LlamaGuard model in
# bfloat16, move it to the selected device, then load the Aegis PEFT
# checkpoint on top of it.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=dtype,
    token=token,
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer. It is fetched from the same repository as the base
# model, so it is given the same access token. Padding goes on the left so
# that the last position of every row is a real token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier that maps a feature vector to a score in (0, 1).

    The input is normalized along its last dimension, then passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after each of the
    first three, and a sigmoid after the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the expected input.
            dropout: Drop probability of the single shared dropout module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules shared across the whole forward pass.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack, created in the same order as the attribute names.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid score for ``x``.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        hidden = F.normalize(x, dim=-1)

        # Each of the first three layers: dropout -> linear -> ReLU.
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            hidden = F.relu(layer(self.dropout(hidden)))

        # Output layer: dropout -> linear -> sigmoid (no ReLU).
        logits = self.hidden_layer_2(self.dropout(hidden))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier: fetch the pretrained weights, move
# the module to the selected device and switch it to evaluation mode (which
# disables its dropout layers).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score ``prompts`` with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states returned, and the hidden state
    at index 32 of the first generation step, taken at the last sequence
    position, is cast to float32 and fed to ``instruction_data_guard``.

    Args:
        prompts: Text(s) to score; passed unchanged to ``tokenizer``.
        tokenizer: Callable tokenizer, invoked with ``padding=True`` and
            ``return_tensors="pt"``.
            NOTE(review): reading position ``-1`` presumably relies on the
            tokenizer padding on the left -- confirm its configuration.
        model: Generative model exposing ``generate`` and ``parameters``.
        instruction_data_guard: Callable mapping the feature tensor to scores.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the model's first parameter, so the function
            also works when the model is not on CUDA.

    Returns:
        Flattened NumPy array of the classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming CUDA: the inputs must live on
        # the same device as the weights that consume them.
        device = next(model.parameters()).device

    # The encoding carries every tensor the tokenizer returns, not just ids.
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

    # Pure inference: no autograd graph is needed for generation or scoring.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0] -> first generation step, [32] -> hidden-state index 32,
        # [:, -1, :] -> last sequence position of every batch row.
        # NOTE(review): index 32 presumably is the final layer of the
        # underlying model -- confirm if the base model changes.
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)

    return scores.flatten().detach().cpu().numpy()
# Prepare sample input: the classifier is fed a single string that combines
# the instruction, the (possibly empty) input and the response.
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]

# Pass the device selected during model initialization explicitly; otherwise
# the function's own default is used even when the script fell back to CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer
# Initialize model embedded with AEGIS: load the base LlamaGuard model in
# bfloat16, move it to the selected device, then load the Aegis PEFT
# checkpoint on top of it.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=dtype,
    token=token,
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer. It is fetched from the same repository as the base
# model, so it is given the same access token. Padding goes on the left so
# that the last position of every row is a real token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token for padding.
tokenizer.pad_token = tokenizer.unk_token
class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier that maps a feature vector to a score in (0, 1).

    The input is normalized along its last dimension, then passed through
    four linear layers (``input_dim -> input_dim -> 2000 -> 500 -> 1``).
    Dropout is applied before every linear layer, ReLU after each of the
    first three, and a sigmoid after the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the expected input.
            dropout: Drop probability of the single shared dropout module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules shared across the whole forward pass.
        self.dropout = Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack, created in the same order as the attribute names.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid score for ``x``.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        hidden = F.normalize(x, dim=-1)

        # Each of the first three layers: dropout -> linear -> ReLU.
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            hidden = F.relu(layer(self.dropout(hidden)))

        # Output layer: dropout -> linear -> sigmoid (no ReLU).
        logits = self.hidden_layer_2(self.dropout(hidden))
        return self.sigmoid(logits)
# Load Instruction Data Guard classifier: fetch the pretrained weights, move
# the module to the selected device and switch it to evaluation mode (which
# disables its dropout layers).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)
# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score ``prompts`` with the Instruction Data Guard classifier.

    The prompts are tokenized as one padded batch, ``model.generate`` is run
    for a single new token with hidden states returned, and the hidden state
    at index 32 of the first generation step, taken at the last sequence
    position, is cast to float32 and fed to ``instruction_data_guard``.

    Args:
        prompts: Text(s) to score; passed unchanged to ``tokenizer``.
        tokenizer: Callable tokenizer, invoked with ``padding=True`` and
            ``return_tensors="pt"``.
            NOTE(review): reading position ``-1`` presumably relies on the
            tokenizer padding on the left -- confirm its configuration.
        model: Generative model exposing ``generate`` and ``parameters``.
        instruction_data_guard: Callable mapping the feature tensor to scores.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of the model's first parameter, so the function
            also works when the model is not on CUDA.

    Returns:
        Flattened NumPy array of the classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming CUDA: the inputs must live on
        # the same device as the weights that consume them.
        device = next(model.parameters()).device

    # The encoding carries every tensor the tokenizer returns, not just ids.
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)

    # Pure inference: no autograd graph is needed for generation or scoring.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # [0] -> first generation step, [32] -> hidden-state index 32,
        # [:, -1, :] -> last sequence position of every batch row.
        # NOTE(review): index 32 presumably is the final layer of the
        # underlying model -- confirm if the base model changes.
        features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(features)

    return scores.flatten().detach().cpu().numpy()
# Prepare sample input: the classifier is fed a single string that combines
# the instruction, the (possibly empty) input and the response.
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]

# Pass the device selected during model initialization explicitly; otherwise
# the function's own default is used even when the script fell back to CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.