wav2vec2-large-xls-r-300m-Urdu

by kingabzpro · Audio Model · 302M params · Urdu (1 language) · license: apache-2.0
603.2K downloads · 13 likes · Production-ready
Edge AI targets: Mobile, Laptop, Server (1GB+ RAM minimum)
Quick Summary

A 300M-parameter wav2vec2 XLS-R model fine-tuned for Urdu automatic speech recognition, shipped with a KenLM 5-gram language model for beam-search decoding.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 1GB+ RAM
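
As a rough sanity check on these figures, the parameter count and fp32 weight footprint can be read straight off the checkpoint. A minimal sketch (assumes only transformers is installed; the estimate covers weights only, not activations or runtime overhead):

# Sketch: count parameters and estimate in-memory weight size.
from transformers import AutoModelForCTC

model = AutoModelForCTC.from_pretrained("kingabzpro/wav2vec2-large-xls-r-300m-Urdu")
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.0f}M parameters")                  # ~300M
print(f"~{n_params * 4 / 1024**3:.2f} GB as fp32 weights")  # roughly halved by model.half()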

Code Examples

Inference With LM (Python, transformers)
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json
import torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Fetch the KenLM 5-gram, its unigram list, and (if present) tuned decoder weights.
kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni = hf_hub_download(mid, "language_model/unigrams.txt")
try:
    attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except Exception:
    attrs = {}

# Build the decoder label list from the CTC vocabulary: blank maps to "",
# the word delimiter to " ", single characters stay as-is; all other
# special tokens are dropped, and their logit columns with them.
v = proc.tokenizer.get_vocab()
id2tok = [t for t, i in sorted(v.items(), key=lambda x: x[1])]
blank = proc.tokenizer.pad_token_id
wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i == blank else " " if t == wdt else t)
    for i, t in enumerate(id2tok) if (i == blank or t == wdt or len(t) == 1)
])

# alpha/beta must be passed to build_ctcdecoder (or set via dec.reset_params);
# assigning dec.alpha / dec.beta attributes directly has no effect on decoding.
dec = build_ctcdecoder(
    list(labels),
    kenlm_model_path=kenlm,
    unigrams=open(uni, encoding="utf-8").read().splitlines(),
    alpha=attrs.get("alpha", 0.5),
    beta=attrs.get("beta", 1.0),
)

# Stream one 16 kHz test clip from Common Voice Urdu and transcribe it.
ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, list(keep)]
print(dec.decode(logits))
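
For a quick comparison without the n-gram model, the same inputs can be decoded greedily with the processor's built-in CTC decoding. A minimal sketch reusing model, proc, and x from the example above; accuracy will typically trail LM beam search:

# Greedy CTC decoding: plain argmax over the full vocabulary, no KenLM rescoring.
with torch.no_grad():
    pred_ids = model(x).logits.argmax(dim=-1)
print(proc.batch_decode(pred_ids)[0])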
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
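To gauge accuracy beyond a single clip, word error rate over a small streamed sample can be computed with the evaluate package (jiwer as its backend); both are assumed extras here, and the sentence field follows the Common Voice schema. References are used unnormalized, so treat the number as a rough sketch rather than a benchmark.

# pip install evaluate jiwer   # assumed extras, not in the original install line
import evaluate

wer = evaluate.load("wer")
sample = ds.cast_column("audio", Audio(sampling_rate=16_000)).take(20)
refs, hyps = [], []
for ex in sample:
    x = proc(ex["audio"]["array"], sampling_rate=16_000,
             return_tensors="pt").input_values.to(model.device)
    with torch.no_grad():
        logits = model(x).logits[0].cpu().numpy()[:, keep]
    hyps.append(dec.decode(logits))
    refs.append(ex["sentence"])            # Common Voice transcript column
print(f"WER on 20-clip sample: {wer.compute(predictions=hyps, references=refs):.3f}")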
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
Inference With LMpythontransformers
# pip install transformers datasets pyctcdecode kenlm huggingface_hub torch

import json, torch
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download

mid = "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
proc = AutoProcessor.from_pretrained(mid)
model = AutoModelForCTC.from_pretrained(mid).eval().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

kenlm = hf_hub_download(mid, "language_model/5gram.bin")
uni  = hf_hub_download(mid, "language_model/unigrams.txt")
try: attrs = json.load(open(hf_hub_download(mid, "language_model/attrs.json"), encoding="utf-8"))
except: attrs = {}

v = proc.tokenizer.get_vocab()
id2tok = [t for t,i in sorted(v.items(), key=lambda x:x[1])]
blank = proc.tokenizer.pad_token_id; wdt = proc.tokenizer.word_delimiter_token
keep, labels = zip(*[
    (i, "" if i==blank else " " if t==wdt else t)
    for i,t in enumerate(id2tok) if (i==blank or t==wdt or len(t)==1)
])

dec = build_ctcdecoder(list(labels), kenlm_model_path=kenlm,
                       unigrams=open(uni,encoding="utf-8").read().splitlines())
dec.alpha, dec.beta = attrs.get("alpha",0.5), attrs.get("beta",1.0)

ds = load_dataset("mozilla-foundation/common_voice_22_0", "ur", split="test", streaming=True)
ex = next(iter(ds.cast_column("audio", Audio(sampling_rate=16_000))))
x = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt").input_values.to(model.device)

with torch.no_grad():
    logits = model(x).logits[0].cpu().numpy()[:, keep]
print(dec.decode(logits))
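To turn the single-clip demo into a small evaluation, the hedged sketch below (not part of the original model card) streams a handful of Common Voice test clips and scores the LM-decoded transcripts with the third-party jiwer package (pip install jiwer). The raw references keep punctuation and casing, so apply your own text normalization before comparing the number against reported WERs.

# Hedged evaluation sketch: reuses proc, model, dec, keep, and ds from the
# LM-decoding example above; jiwer is an assumed extra dependency.
import itertools
import torch
import jiwer
from datasets import Audio

refs, hyps = [], []
stream = ds.cast_column("audio", Audio(sampling_rate=16_000))
for ex in itertools.islice(stream, 20):  # score the first 20 test clips
    inputs = proc(ex["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        lg = model(inputs.input_values.to(model.device)).logits[0].cpu().numpy()[:, list(keep)]
    refs.append(ex["sentence"])   # raw reference transcript
    hyps.append(dec.decode(lg))

print(f"WER over {len(refs)} clips: {jiwer.wer(refs, hyps):.3f}")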

Deploy This Model

Production-ready deployment in minutes.

Together.ai (Fastest API): instant API access to this model. A production-ready inference API; start free and scale to millions of requests.

Replicate (Easiest Setup): one-click model deployment. Run models in the cloud through a simple API, with no DevOps required.

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.