bybert-jp-400m
by tohoku-nlp
1 language (Japanese)
Architecture: llama_enc
Tags: Code Model, OTHER, New, Early-stage
2 downloads
Quick Summary
This model is based on the Llama architecture; by removing the causal attention mask, it is used as an encoder-style language model. Specifically, it adopts the following modules:
- SwiGLU
- Rotary Positional Embeddings (RoPE)
- Grouped Query Attention (GQA)
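To make the architectural point concrete, here is a minimal, illustrative sketch (not the model's actual implementation) of how removing the causal mask from scaled dot-product attention turns a decoder-style layer into a bidirectional, encoder-style one:

import torch
import torch.nn.functional as F

def attention(q, k, v, causal: bool):
    # q, k, v: (batch, heads, seq_len, head_dim)
    scores = q @ k.transpose(-2, -1) / (q.size(-1) ** 0.5)
    if causal:
        # Decoder-style: position i may only attend to positions <= i.
        n = q.size(-2)
        future = torch.triu(torch.ones(n, n, dtype=torch.bool), diagonal=1)
        scores = scores.masked_fill(future, float("-inf"))
    # With causal=False, every position attends to the whole sequence,
    # which is exactly the encoder-style behavior described above.
    return F.softmax(scores, dim=-1) @ v

q = k = v = torch.randn(1, 2, 4, 8)
decoder_out = attention(q, k, v, causal=True)
encoder_out = attention(q, k, v, causal=False)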
Code Examples
Usage (Python, transformers)
import argparse

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
    f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。",  # mask spanning 6 bytes
    f"日本一高い山は{MASK_PLACEHOLDER * 9}です。",  # mask spanning 9 bytes
]


def main(args):
    torch.manual_seed(args.seed)
    # The original example assumes a CUDA device; fall back to CPU if none is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        trust_remote_code=True,
    )
    model = AutoModelForMaskedLM.from_pretrained(
        args.model_name_or_path,
        dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model.to(device)
    model.eval()

    # Replace each placeholder with the tokenizer's actual mask token.
    input_texts = [
        s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
        for s in SAMPLE_INPUT_TEXTS
    ]
    batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
    batch = batch.to(device)

    with torch.no_grad():
        outputs = model(**batch)

    # Greedy decoding: take the highest-scoring byte at every position, then
    # restore pad positions so padding is not overwritten by predictions.
    decoded_ids = torch.argmax(outputs.logits, dim=-1)
    is_pad = batch.input_ids == tokenizer.pad_token_id
    decoded_ids[is_pad] = tokenizer.pad_token_id
    decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)

    for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
        input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
        print("===")
        print(f"Input: {input_text}")
        print(f"Decoded: {decoded_text}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--model_name_or_path",
        "-m",
        type=str,
        default="tohoku-nlp/bybert-jp-400m",
        help="Path to the model or model identifier from huggingface.co/models.",
    )
    parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
    args = parser.parse_args()
    main(args)
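Saved as a script, the example runs as-is, since both flags have defaults (e.g. python mlm_demo.py, where mlm_demo.py is whatever filename you choose). The "mask spanning 6 bytes" and "9 bytes" comments follow from the model operating on UTF-8 bytes: each <mask> token stands for a single byte, and most Japanese kana and kanji occupy three bytes in UTF-8. The intended completions are presumably 仙台 (Sendai: 2 characters × 3 bytes = 6) and 富士山 (Mt. Fuji: 3 characters × 3 bytes = 9); a quick check confirms the byte counts:

# Byte counts of the (presumed) answers under UTF-8.
for answer in ["仙台", "富士山"]:
    print(answer, len(answer.encode("utf-8")), "bytes")
# -> 仙台 6 bytes
# -> 富士山 9 bytes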
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)利用方法pythontransformers
import argparse
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
MASK_PLACEHOLDER = "<mask>"
SAMPLE_INPUT_TEXTS = [
f"東北大学は宮城県{MASK_PLACEHOLDER * 6}市にある大学です。", # 6 bytes mask
f"日本一高い山は{MASK_PLACEHOLDER * 9}です。", # 9 bytes mask
]
def main(args):
torch.manual_seed(args.seed)
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
)
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
dtype=torch.bfloat16,
trust_remote_code=True,
)
model.to(device)
model.eval()
input_texts = [
s.replace(MASK_PLACEHOLDER, tokenizer.mask_token)
for s in SAMPLE_INPUT_TEXTS
]
batch = tokenizer(input_texts, return_tensors="pt", padding="longest")
batch = batch.to(device)
outputs = model(**batch)
decoded_ids = torch.argmax(outputs.logits, dim=-1)
is_pad = batch.input_ids == tokenizer.pad_token_id
decoded_ids[is_pad] = tokenizer.pad_token_id
decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=False)
for input_ids, decoded_text in zip(batch.input_ids, decoded_texts):
input_text = tokenizer.decode(input_ids, skip_special_tokens=False)
print("===")
print(f"Input: {input_text}")
print(f"Decoded: {decoded_text}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
"--model_name_or_path",
"-m",
type=str,
default="tohoku-nlp/bybert-jp-400m",
help="Path to the model or model identifier from huggingface.co/models."
)
parser.add_argument("--seed", "-s", type=int, help="Random seed", default=42)
args = parser.parse_args()
main(args)