apertus-pretrain-toxicity
by swiss-ai
9 languages · license: apache-2.0
Quick Summary
A binary toxicity scorer built for curating pretraining data: a two-layer MLP classifier trained on top of frozen, mean-pooled XLM-RoBERTa embeddings, with a separate checkpoint for each of nine languages (English, Chinese, French, German, Italian, Spanish, Portuguese, Polish, Dutch).
Code Examples
Toxicity Scoring (Python / PyTorch)
```python
import torch
import torch.nn as nn
from transformers import AutoTokenizer, RobertaModel


# Define the model with an MLP classifier on top of XLM-RoBERTa
class RobertaClassifier(nn.Module):
    def __init__(self, num_classes,
                 model_name="FacebookAI/xlm-roberta-base",
                 device="cuda:0"):
        super(RobertaClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.freeze_roberta_encoder()
        self.device = device
        # Two-layer MLP head; these are the only trainable parameters.
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, self.roberta.config.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.roberta.config.hidden_size, num_classes)
        )

    def freeze_roberta_encoder(self):
        for param in self.roberta.parameters():
            param.requires_grad = False

    def mean_pooling(self, model_output, attention_mask):
        # Mean of token embeddings, with padding positions masked out; see
        # https://huggingface.co/aditeyabaral/sentencetransformer-xlm-roberta-base
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9)

    def forward(self, input_ids=None, attention_mask=None,
                roberta_embeddings=None):
        # Encode raw inputs unless pooled embeddings are supplied directly.
        if roberta_embeddings is None:
            outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
            roberta_embeddings = self.mean_pooling(outputs, attention_mask)
        logits = self.classifier(roberta_embeddings)
        return torch.nn.functional.softmax(logits, dim=1)

    def predict(self, input_ids=None, attention_mask=None,
                roberta_embeddings=None, **kwargs):
        """
        Scores inputs for toxicity.

        Args:
            input_ids, attention_mask: tokenized inputs as produced by the tokenizer.
            roberta_embeddings: optional precomputed pooled embeddings; when given,
                input_ids and attention_mask are ignored.

        Returns:
            numpy array with the probability of the toxic class for each input.
        """
        self.eval()
        with torch.no_grad():
            if roberta_embeddings is None:
                probs = self(input_ids, attention_mask)
            else:
                probs = self(roberta_embeddings=roberta_embeddings)
        return probs[:, 1].cpu().numpy()
```
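Because `freeze_roberta_encoder` turns off gradients for the encoder, only the MLP head is updated during training. As a quick sanity check (a sketch, not part of the released code), you can count trainable parameters:

```python
model = RobertaClassifier(num_classes=2, device="cpu")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / total: {total:,}")
# Only the head's two Linear layers are trainable (roughly 0.6M parameters
# for hidden size 768); the much larger XLM-RoBERTa encoder stays frozen.
```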
```python
LANGUAGE = "english"  # choose from ["english", "chinese", "french", "german",
                      #              "italian", "spanish", "portuguese", "polish", "dutch"]
MODEL_PATH = f"{MODEL_DIR}/{LANGUAGE}.pth"  # MODEL_DIR: directory holding the per-language checkpoints
DEVICE = "cpu"

model = RobertaClassifier(device=DEVICE, num_classes=2)
model.load_state_dict(state_dict=torch.load(MODEL_PATH, map_location=torch.device(DEVICE)))

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
document = ["I want to predict the toxicity score of this document: I am happy today.",
            "I want to predict the toxicity score of this document: this is a violent content!!"]
inputs = tokenizer(document, return_tensors="pt", padding=True, truncation=True, max_length=512)
model.predict(**inputs)  # scores: [0.00121997, 0.9723031]
```
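For pretraining-data filtering, documents would typically be scored in batches and kept only if they fall below a toxicity cutoff. The sketch below is illustrative: `filter_non_toxic` and the 0.5 threshold are assumptions, not part of this release.

```python
THRESHOLD = 0.5  # hypothetical cutoff; tune on held-out labeled data

def filter_non_toxic(documents, batch_size=32):
    """Return the subset of documents whose toxicity score is below THRESHOLD."""
    kept = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True,
                        truncation=True, max_length=512)
        scores = model.predict(**enc)
        kept.extend(doc for doc, score in zip(batch, scores) if score < THRESHOLD)
    return kept
```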