instruction-data-guard

61
17
by
nvidia
Other
OTHER
New
61 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

Description: Instruction Data Guard is a deep-learning classification model that helps identify LLM poisoning attacks in datasets.

Code Examples

Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype, token=token).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# Pass the same access token used for the base model: both loads resolve the
# same repository id. Left padding puts each prompt's last token at position
# -1, which is the position the scoring function reads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token for batched (padded) calls.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Fully connected network that maps a feature vector to a sigmoid score.

    The stack is ``input_dim -> input_dim -> 2000 -> 500 -> 1``; dropout
    precedes every linear layer and ReLU follows all but the last one.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Size of the last dimension of the input features.
            dropout: Probability handed to the shared ``Dropout`` module.
        """
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules, reused throughout the forward pass.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear layers, created in forward-pass order.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output for ``x`` after normalizing its last dim."""
        activations = F.normalize(x, dim=-1)
        relu_layers = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for layer in relu_layers:
            activations = F.relu(layer(self.dropout(activations)))
        logit = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logit)

# Load Instruction Data Guard classifier
instruction_data_guard = InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
instruction_data_guard = instruction_data_guard.to(device)
instruction_data_guard = instruction_data_guard.eval()

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device="cuda",
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized with padding, ``model.generate`` is run for one
    new token with hidden states returned, and the hidden state at index
    ``[0][32]``, last sequence position, is fed to ``instruction_data_guard``.

    Args:
        prompts: Text input(s) accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping that has a ``.to(device)`` method.
        model: Object whose ``generate`` returns ``hidden_states``.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. A CUDA request falls
            back to ``"cpu"`` when CUDA is not available.

    Returns:
        The classifier output flattened to a 1-D NumPy array on the CPU.
    """
    # The default is "cuda" and callers may omit ``device`` on CPU-only hosts;
    # fall back instead of failing inside ``.to(device)``.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    outputs = model.generate(
        **encoded,
        output_hidden_states=True,
        return_dict_in_generate=True,
        max_new_tokens=1,
        pad_token_id=0,
    )
    # NOTE(review): [0] is presumably the single generation step and [32] the
    # final decoder layer of the 7B base model - confirm against the model.
    features = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
    # Inference only: no autograd graph is needed for the classifier call.
    with torch.no_grad():
        scores = instruction_data_guard(features)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample =  f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the base model and attach the AEGIS PEFT adapter
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    # Same credential as the model load, so both downloads from this
    # repository authenticate the same way.
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head producing a sigmoid score.

    Layer widths are ``input_dim`` -> ``input_dim`` -> 2000 -> 500 -> 1.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim

        # Parameter-free modules reused at every stage of ``forward``.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Linear stack; attribute names are kept so saved weights still match.
        self.input_layer = Linear(input_dim, input_dim)
        self.hidden_layer_0 = Linear(input_dim, 2000)
        self.hidden_layer_1 = Linear(2000, 500)
        self.hidden_layer_2 = Linear(500, 1)

    def forward(self, x):
        """Normalize ``x`` on its last dimension and return the sigmoid score.

        Dropout is applied after the normalization and after each of the
        three ReLU-activated layers; the last layer feeds the sigmoid.
        """
        hidden = self.dropout(F.normalize(x, dim=-1))
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.sigmoid(self.hidden_layer_2(hidden))

# Fetch the Instruction Data Guard classifier, place it on `device`, and
# switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    Args:
        prompts: Text(s) handed to ``tokenizer`` (tokenized with padding).
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and ``**`` unpacking into ``model.generate``.
        model: Generative model whose hidden states serve as features.
        instruction_data_guard: Classifier applied to the extracted features.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s first parameter.

    Returns:
        The classifier output, flattened and converted to a NumPy array.
    """
    if device is None:
        # Follow the model instead of assuming "cuda", so the call also works
        # when the model was loaded on the CPU or on a non-default GPU.
        device = next(model.parameters()).device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # First generation step, layer entry 32, last sequence position.
        # NOTE(review): index 32 presumably selects the final layer of the 7B
        # backbone - confirm before swapping in a different model.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Single text combining the instruction, optional input, and response.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Move the inputs to the same `device` the models were placed on, so the
# script also runs when it fell back to the CPU.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
# The base model is loaded first; the Aegis PEFT adapter is applied on top.
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The access token is forwarded here too, so the tokenizer files are fetched
# with the same credentials as the base-model weights.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier head that ends in a sigmoid.

    The input is normalized along its last dimension, then passed through
    three ReLU-activated linear layers (``input_dim`` -> ``input_dim`` ->
    2000 -> 500) and a final 500 -> 1 projection. Dropout is applied before
    every linear layer.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        super().__init__()
        self.input_dim = input_dim
        # One dropout module is shared by every stage of the network.
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()

        # Attribute names double as state-dict keys, so they are kept as-is.
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)
        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the final one-unit projection."""
        activations = F.normalize(x, dim=-1)
        relu_stages = (self.input_layer, self.hidden_layer_0, self.hidden_layer_1)
        for linear in relu_stages:
            activations = F.relu(linear(self.dropout(activations)))
        logits = self.hidden_layer_2(self.dropout(activations))
        return self.sigmoid(logits)

# Load Instruction Data Guard classifier
# Fetch the pretrained classifier, move it to the selected device and put it
# in evaluation mode (dropout disabled).
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score each prompt with the Instruction Data Guard classifier.

    Args:
        prompts: Text sample(s) passed to ``tokenizer``.
        tokenizer: Tokenizer, called with ``padding=True`` and
            ``return_tensors="pt"``.
        model: Generative model whose hidden states feed the classifier.
        instruction_data_guard: Classifier applied to the extracted state.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses the device of ``model``'s parameters, so the call also works
            when the model lives on the CPU; an explicit value such as
            ``"cuda"`` is honored as before.

    Returns:
        Flattened NumPy array of classifier outputs.
    """
    if device is None:
        # Follow the model rather than assuming a GPU is present.
        device = next(model.parameters()).device
    encoded = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip building an autograd graph for the classifier pass.
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0]: first generation step; [32]: layer index 32
        # (presumably the final decoder layer of LlamaGuard-7b — confirm);
        # [:, -1, :]: last sequence position of every batch row.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
# Each sample is serialized into one "Instruction / Input / Response" string.
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Pass the detected device explicitly so the tokenized inputs land on the
# same device as the model (GPU, or the CPU fallback).
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]
Initialize model embedded with AEGIS (python, transformers)
import torch
import torch.nn.functional as F
from huggingface_hub import PyTorchModelHubMixin
from peft import PeftModel
from torch.nn import Dropout, Linear
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model embedded with AEGIS
pretrained_model_name_or_path = "meta-llama/LlamaGuard-7b"
dtype = torch.bfloat16
token = "hf_1234"  # Replace with your user access token
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path, torch_dtype=dtype, token=token
).to(device)
# Attach the AEGIS adapter weights on top of the base model.
peft_model_name_or_path = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
model = PeftModel.from_pretrained(base_model, peft_model_name_or_path)

# Initialize tokenizer
# The tokenizer is fetched from the same repository as the base model, so it is
# given the same access token.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    padding_side="left",
    token=token,
)
# Pad with the unknown token.
tokenizer.pad_token = tokenizer.unk_token

class InstructionDataGuardNet(torch.nn.Module, PyTorchModelHubMixin):
    """Feed-forward classifier: input_dim -> input_dim -> 2000 -> 500 -> 1.

    A single dropout module is reused before every linear layer, and the
    final linear output is passed through a sigmoid.
    """

    def __init__(self, input_dim=4096, dropout=0.7):
        """Build the layers.

        Args:
            input_dim: Width of the input embeddings and of the first layer.
            dropout: Dropout probability shared by all stages.
        """
        super().__init__()
        self.input_dim = input_dim
        self.dropout = Dropout(p=dropout)
        self.sigmoid = torch.nn.Sigmoid()
        self.input_layer = Linear(in_features=input_dim, out_features=input_dim)

        self.hidden_layer_0 = Linear(in_features=input_dim, out_features=2000)
        self.hidden_layer_1 = Linear(in_features=2000, out_features=500)
        self.hidden_layer_2 = Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Return sigmoid outputs for a batch of embeddings.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = F.normalize(x, dim=-1)
        # The three ReLU stages share the same shape: dropout, linear, ReLU.
        for layer in (self.input_layer, self.hidden_layer_0, self.hidden_layer_1):
            x = F.relu(layer(self.dropout(x)))
        return self.sigmoid(self.hidden_layer_2(self.dropout(x)))

# Load Instruction Data Guard classifier: fetch the pretrained weights, place
# the module on the selected device, and switch it to evaluation mode.
instruction_data_guard = (
    InstructionDataGuardNet.from_pretrained("nvidia/instruction-data-guard")
    .to(device)
    .eval()
)

# Function to compute results
def get_instruction_data_guard_results(
    prompts,
    tokenizer,
    model,
    instruction_data_guard,
    device=None,
):
    """Score prompts with the Instruction Data Guard classifier.

    The prompts are tokenized and run through ``model.generate`` for a single
    new token with hidden states enabled. The hidden state at index 32, taken
    at the last input position, is then fed to ``instruction_data_guard``.

    Args:
        prompts: Text, or list of texts, to score.
        tokenizer: Callable tokenizer whose result has a ``.to(device)``
            method. Only the last position is read, so left padding is
            expected.
        model: Object exposing ``generate`` (and ``device`` when ``device``
            is omitted).
        instruction_data_guard: Classifier applied to the extracted
            embeddings.
        device: Device the tokenized batch is moved to. ``None`` (default)
            uses ``model.device`` so the batch lands on the same device as
            the model, including on CPU-only hosts.

    Returns:
        Flattened NumPy array of classifier outputs; empty for an empty list
        of prompts.
    """
    # Nothing to score: do not ask the tokenizer to pad an empty batch.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return torch.empty(0, dtype=torch.float).numpy()

    if device is None:
        device = model.device

    input_ids = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
    # Inference only: skip autograd bookkeeping for the LM and the classifier.
    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            pad_token_id=0,
        )
        # hidden_states[0] belongs to the first generation step; take entry 32
        # at the final sequence position and cast it to float32.
        input_tensor = outputs.hidden_states[0][32][:, -1, :].to(torch.float)
        scores = instruction_data_guard(input_tensor)
    return scores.flatten().detach().cpu().numpy()

# Prepare sample input
instruction = "Find a route between San Diego and Phoenix which passes through Nevada"
input_ = ""
response = "Drive to Las Vegas with highway 15 and from there drive to Phoenix with highway 93"
benign_sample = f"Instruction: {instruction}. Input: {input_}. Response: {response}."
text_samples = [benign_sample]
# Score on the same device the models were placed on instead of relying on the
# helper's default device, so the example also runs on CPU-only hosts.
poisoning_scores = get_instruction_data_guard_results(
    text_samples, tokenizer, model, instruction_data_guard, device=device
)
print(poisoning_scores)
# [0.01149639]

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.