robust-velocity-adapter
by AbstractPhil
Quick Summary
The 155,000-step version was weight-trained on about 158,100,000 prompt samples. The underlying T5-small model is fried to echo and interpolate math in complex, intended ways.
Code Examples
The training loop used to fit the adapter. `RobustVelocityAdapter`, `ParsedMultiCharDataset`, and `hetero_loss` are project-specific and defined elsewhere in the repository; the imports below are the standard ones the script needs.

```python
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader

from safetensors.torch import save_file
from tqdm import tqdm
from transformers import (
    CLIPTextModel,
    CLIPTokenizerFast,
    T5EncoderModel,
    T5TokenizerFast,
)
# from huggingface_hub import upload_file  # only needed when pushing to the Hub

# RobustVelocityAdapter, ParsedMultiCharDataset, and hetero_loss are
# defined elsewhere in this repository.


def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # HF Hub settings
    hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
    push_every_n_steps = 5000

    # Tokenizers & frozen models
    t5_tok = T5TokenizerFast.from_pretrained("t5-small")
    t5_mod = T5EncoderModel.from_pretrained(
        "AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
    ).to(device).eval()
    clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
    clip_mod = CLIPTextModel.from_pretrained(
        "openai/clip-vit-large-patch14"
    ).to(device).eval()

    # Adapter & optimizer
    adapter = RobustVelocityAdapter(out_tokens=77).to(device)
    optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)

    # Compile models for speed
    t5_mod = torch.compile(t5_mod)
    clip_mod = torch.compile(clip_mod)
    adapter = torch.compile(adapter)

    scaler = GradScaler()  # for mixed precision

    # Data
    dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
                                     num_files=12)
    loader = DataLoader(dataset,
                        batch_size=None,
                        num_workers=4,
                        pin_memory=True)
    iterator = iter(loader)

    batch_size = 256
    accum_steps = 4  # effective BS = 256 * 4 = 1024
    max_steps = math.ceil(dataset.total_rows / batch_size)
    pbar = tqdm(total=max_steps, desc="Adapter training")

    for step in range(1, max_steps + 1):
        # zero grads at the start of each accumulation window
        if (step - 1) % accum_steps == 0:
            optimizer.zero_grad()

        # 1) Collect batch
        texts = []
        for _ in range(batch_size):
            try:
                _, txt = next(iterator)
            except StopIteration:
                iterator = iter(loader)
                _, txt = next(iterator)
            texts.append(txt)

        # 2) Tokenize
        t5_inputs = t5_tok(texts,
                           padding=True,
                           truncation=True,
                           max_length=77,
                           return_tensors="pt").to(device)
        clip_inputs = clip_tok(texts,
                               padding="max_length",
                               truncation=True,
                               max_length=77,
                               return_tensors="pt").to(device)

        # 3) Forward + loss in mixed precision
        with autocast():
            t5_seq = t5_mod(**t5_inputs).last_hidden_state        # [B,77,512]
            clip_seq = clip_mod(**clip_inputs).last_hidden_state  # [B,77,768]

            anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
            delta_target = clip_seq - anchor_pred
            loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)

            # cosine anchor alignment
            cos_sim = nn.functional.cosine_similarity(
                anchor_pred.reshape(-1, 768),
                clip_seq.reshape(-1, 768),
                dim=-1
            ).mean()
            loss_anchor = (1 - cos_sim) * 0.1

            loss = loss_delta + loss_anchor
            loss = loss / accum_steps  # scale for accumulation

        # 4) Backward + optimizer step
        scaler.scale(loss).backward()
        if step % accum_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()

        pbar.update(1)
        pbar.set_postfix(loss=(loss.item() * accum_steps))

        # 5) Save & push every N steps
        if step % push_every_n_steps == 0:
            ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
            save_file(adapter.state_dict(), ckpt)
            # upload_file(ckpt, ckpt, repo_id=hf_repo_id)

    pbar.close()


if __name__ == "__main__":
    main()
```
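`hetero_loss` is not shown in this card. Given its call signature, `hetero_loss(delta_pred, delta_target, sigma_pred)`, the standard choice for per-element heteroscedastic regression is a Gaussian negative log-likelihood with a predicted log-sigma. The sketch below is that standard form, offered as an assumption about the intent rather than the repository's actual definition:

```python
import torch


def hetero_loss(pred: torch.Tensor,
                target: torch.Tensor,
                log_sigma: torch.Tensor) -> torch.Tensor:
    """Heteroscedastic Gaussian NLL (assumed form; the repo's own
    hetero_loss may differ). Treats `log_sigma` as a predicted
    per-element log standard deviation."""
    inv_var = torch.exp(-2.0 * log_sigma)  # 1 / sigma^2
    sq_err = (pred - target) ** 2
    # 0.5 * (err^2 / sigma^2 + log sigma^2), constant term dropped
    return 0.5 * (sq_err * inv_var + 2.0 * log_sigma).mean()
```

Under this form, residuals on elements the adapter marks as uncertain (large sigma) are down-weighted, while the `2.0 * log_sigma` term penalizes inflating sigma everywhere at once.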
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()text
def main():
device = "cuda" if torch.cuda.is_available() else "cpu"
# HF Hub settings
hf_repo_id = "AbstractPhil/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok"
push_every_n_steps = 5000
# Tokenizers & frozen models
t5_tok = T5TokenizerFast.from_pretrained("t5-small")
t5_mod = T5EncoderModel.from_pretrained(
"AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
clip_tok = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
clip_mod = CLIPTextModel.from_pretrained(
"openai/clip-vit-large-patch14"
).to(device).eval()
# Adapter & optimizer
adapter = RobustVelocityAdapter(out_tokens=77).to(device)
optimizer = optim.AdamW(adapter.parameters(), lr=5e-4)
# Compile models for speed
t5_mod = torch.compile(t5_mod)
clip_mod = torch.compile(clip_mod)
adapter = torch.compile(adapter)
scaler = GradScaler() # for mixed precision
# Data
dataset = ParsedMultiCharDataset("AbstractPhil/human-templated-captions-1b",
num_files=12)
loader = DataLoader(dataset,
batch_size=None,
num_workers=4,
pin_memory=True)
iterator = iter(loader)
batch_size = 256
accum_steps = 4 # effective BS = 256 * 4 = 1024
max_steps = math.ceil(dataset.total_rows / batch_size)
pbar = tqdm(total=max_steps, desc="Adapter training")
for step in range(1, max_steps+1):
# zero grads on actual step
if (step-1) % accum_steps == 0:
optimizer.zero_grad()
# 1) Collect batch
texts = []
for _ in range(batch_size):
try:
_, txt = next(iterator)
except StopIteration:
iterator = iter(loader)
_, txt = next(iterator)
texts.append(txt)
# 2) Tokenize
t5_inputs = t5_tok(texts,
padding=True,
truncation=True,
max_length=77,
return_tensors="pt").to(device)
clip_inputs = clip_tok(texts,
padding="max_length",
truncation=True,
max_length=77,
return_tensors="pt").to(device)
# 3) Forward + loss in mixed precision
with autocast():
t5_seq = t5_mod(**t5_inputs).last_hidden_state # [B,77,512]
clip_seq = clip_mod(**clip_inputs).last_hidden_state # [B,77,768]
anchor_pred, delta_pred, sigma_pred = adapter(t5_seq)
delta_target = clip_seq - anchor_pred
loss_delta = hetero_loss(delta_pred, delta_target, sigma_pred)
# cosine anchor alignment
cos_sim = nn.functional.cosine_similarity(
anchor_pred.reshape(-1,768),
clip_seq.reshape(-1,768),
dim=-1
).mean()
loss_anchor = (1 - cos_sim) * 0.1
loss = loss_delta + loss_anchor
loss = loss / accum_steps # scale for accumulation
# 4) Backward + optimizer step
scaler.scale(loss).backward()
if step % accum_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(adapter.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
pbar.update(1)
pbar.set_postfix(loss=(loss.item() * accum_steps))
# 5) Save & push every N steps
if step % push_every_n_steps == 0:
ckpt = f"/content/drive/MyDrive/t5-adapter/t5-to-vit-l-14-velocity-adapter-v3-100m-77tok_step_{step}.safetensors"
save_file(adapter.state_dict(), ckpt)
#upload_file(ckpt, ckpt, repo_id=hf_repo_id)
pbar.close()Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free APIReplicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy NowDisclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.
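The RobustVelocityAdapter class itself is also not included in this card, so the sketch below is a hypothetical stand-in, not the released ~100M-parameter architecture: it mirrors only the interface the training loop depends on (a [B, L, 512] T5 sequence in; anchor, delta, and sigma tensors of shape [B, 77, 768] out) and shows how inference with a saved checkpoint would plausibly look, assuming the adapted embedding is anchor_pred + delta_pred, which is what the delta_target = clip_seq - anchor_pred decomposition in the loss implies. The checkpoint filename is illustrative.

import torch
import torch.nn as nn
from safetensors.torch import load_file
from transformers import T5TokenizerFast, T5EncoderModel

class RobustVelocityAdapter(nn.Module):
    """Hypothetical stand-in that matches the interface used in training;
    the real architecture is not published in this card."""
    def __init__(self, d_in=512, d_out=768, out_tokens=77, n_heads=8):
        super().__init__()
        self.query = nn.Parameter(torch.randn(out_tokens, d_out) * 0.02)  # 77 learned output slots
        self.proj = nn.Linear(d_in, d_out)                                # 512 -> 768
        self.attn = nn.MultiheadAttention(d_out, n_heads, batch_first=True)
        self.anchor_head = nn.Linear(d_out, d_out)
        self.delta_head = nn.Linear(d_out, d_out)
        self.sigma_head = nn.Linear(d_out, d_out)

    def forward(self, t5_seq):
        kv = self.proj(t5_seq)                                      # [B, L, 768]
        q = self.query.unsqueeze(0).expand(t5_seq.size(0), -1, -1)  # [B, 77, 768]
        h, _ = self.attn(q, kv, kv)        # cross-attend T5 tokens into the 77 slots
        return self.anchor_head(h), self.delta_head(h), self.sigma_head(h)

device = "cuda" if torch.cuda.is_available() else "cpu"
tok = T5TokenizerFast.from_pretrained("t5-small")
enc = T5EncoderModel.from_pretrained(
    "AbstractPhil/T5-Small-Human-Attentive-Try2-Pass3"
).to(device).eval()
adapter = RobustVelocityAdapter().to(device).eval()

state = load_file("velocity_adapter_step_155000.safetensors")  # illustrative filename
# State dicts saved from a torch.compile'd module carry an "_orig_mod." key prefix.
state = {k.removeprefix("_orig_mod."): v for k, v in state.items()}
adapter.load_state_dict(state)

with torch.no_grad():
    batch = tok(["a photo of a cat"], padding=True, truncation=True,
                max_length=77, return_tensors="pt").to(device)
    t5_seq = enc(**batch).last_hidden_state   # [1, L, 512]
    anchor, delta, _ = adapter(t5_seq)
    clip_like = anchor + delta                # [1, 77, 768] CLIP ViT-L/14-shaped tokens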