Cosmobillian
radiologist_llama_v3
samsung_innovation_campus_radiology_model
Turkish_llasa_tts
- Developed by: Cosmobillian - License: apache-2.0 - Fine-tuned from model: unsloth/llasa-1b. This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
radiologist_llama
turkish_gemma3n_4b
orpheust-tts-base-fine-tune
turkish_orpheus_tts
"""Orpheus-TTS Turkish zero-shot inference server.

- Developed by: Cosmobillian - License: apache-2.0
- Fine-tuned from model: Karayakar/Orpheus-TTS-Turkish-PT-5000
  (a Llama model trained 2x faster with Unsloth and Hugging Face's TRL library).

inference.py — please install the necessary libraries:
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
    pip install snac pathlib torch transformers huggingface_hub librosa numpy scipy torchaudio Flask

NOTE(review): this file was recovered from a whitespace-mangled paste
(underscores and `*` operators were stripped).  Identifiers, operators,
glob patterns, missing returns and the missing Flask app object have been
restored; spots where the original text was ambiguous are marked below.
"""

import os
from datetime import datetime
from pathlib import Path

import librosa
import numpy as np
import torch
import torchaudio
from flask import Flask, jsonify, request
from huggingface_hub import snapshot_download
from scipy.io.wavfile import write
from snac import SNAC
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# NOTE(review): the original referenced an undefined `modelLocalPath`;
# the definition was lost in the paste.  Point this at the local model
# directory (or set the ORPHEUS_MODEL_PATH environment variable).
MODEL_LOCAL_PATH = os.environ.get("ORPHEUS_MODEL_PATH", "./orpheus-tts-turkish")

app = Flask(__name__)


def load_orpheus_tokenizer(model_id: str = MODEL_LOCAL_PATH) -> AutoTokenizer:
    """Load the Orpheus tokenizer from a local directory (no network fetch)."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, local_files_only=True, device_map="cuda"
    )
    return tokenizer


def load_snac():
    """Load the 24 kHz SNAC neural audio codec used to (de)tokenize audio."""
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
    return snac_model


def load_orpheus_auto_model(model_id: str = MODEL_LOCAL_PATH):
    """Load the causal-LM TTS model in bfloat16 and move it to the GPU."""
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        local_files_only=True,
        device_map="cuda",
    )
    model.cuda()
    return model


def tokenize_audio(audio_file_path, snac_model):
    """Encode a reference audio file into the flattened 7-codes-per-frame
    Orpheus token stream.

    The three SNAC codebook layers (1x / 2x / 4x temporal resolution) are
    interleaved per frame and offset into the LM vocabulary starting at
    token id 128266, with each layer slot shifted by a further 4096.
    """
    audio_array, sample_rate = librosa.load(audio_file_path, sr=24000)
    waveform = torch.from_numpy(audio_array).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    with torch.inference_mode():
        codes = snac_model.encode(waveform)
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))
    return all_codes


def prepare_inputs(
    fpath_audio_ref,
    audio_ref_transcript: str,
    text_prompts: list[str],
    snac_model,
    tokenizer,
):
    """Build left-padded input_ids / attention_mask batches for generation.

    Each prompt is wrapped as [SOH(128259)] prompt [EOT(128009), EOH(128260),
    128261, SOS(128257)].  Shorter rows are left-padded with 128263 so the
    batch is rectangular; the attention mask zeroes the padding.

    NOTE(review): `fpath_audio_ref` / `audio_ref_transcript` are accepted but
    unused in this version (a zero-prompt variant is commented out below);
    kept for interface compatibility with zero_shot_tts().
    """
    start_tokens = torch.tensor([[128259]], dtype=torch.int64)
    end_tokens = torch.tensor([[128009, 128260, 128261, 128257]], dtype=torch.int64)
    final_tokens = torch.tensor([[128258, 128262]], dtype=torch.int64)

    all_modified_input_ids = []
    for prompt in text_prompts:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        # second_input_ids = torch.cat([zero_prompt_input_ids, start_tokens, input_ids, end_tokens], dim=1)
        second_input_ids = torch.cat([start_tokens, input_ids, end_tokens], dim=1)
        all_modified_input_ids.append(second_input_ids)

    all_padded_tensors = []
    all_attention_masks = []
    max_length = max(t.shape[1] for t in all_modified_input_ids)
    for modified_input_ids in all_modified_input_ids:
        padding = max_length - modified_input_ids.shape[1]
        padded_tensor = torch.cat(
            [torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids],
            dim=1,
        )
        attention_mask = torch.cat(
            [
                torch.zeros((1, padding), dtype=torch.int64),
                torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64),
            ],
            dim=1,
        )
        all_padded_tensors.append(padded_tensor)
        all_attention_masks.append(attention_mask)

    input_ids = torch.cat(all_padded_tensors, dim=0).to("cuda")
    attention_mask = torch.cat(all_attention_masks, dim=0).to("cuda")
    return input_ids, attention_mask


def inference(model, input_ids, attention_mask):
    """Sample audio tokens from the LM and append the EOAI marker (128262)."""
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            top_k=10,
            top_p=0.9,
            repetition_penalty=1.9,
            num_return_sequences=1,
            eos_token_id=128258,
        )
    generated_ids = torch.cat(
        [generated_ids, torch.tensor([[128262]]).to("cuda")], dim=1
    )  # EOAI
    return generated_ids


def convert_tokens_to_speech(generated_ids, snac_model):
    """Crop generated ids after the last SOS (128257), drop EOS (128258),
    trim each row to a multiple of 7 tokens, un-offset by 128266 and decode
    each row back to a waveform via redistribute_codes()."""
    token_to_find = 128257
    token_to_remove = 128258

    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_occurrence_idx = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped_tensor = generated_ids

    processed_rows = []
    for row in cropped_tensor:
        masked_row = row[row != token_to_remove]
        processed_rows.append(masked_row)

    code_lists = []
    for row in processed_rows:
        row_length = row.size(0)
        new_length = (row_length // 7) * 7  # keep only whole 7-token frames
        trimmed_row = row[:new_length]
        trimmed_row = [t - 128266 for t in trimmed_row]
        code_lists.append(trimmed_row)

    my_samples = []
    for code_list in code_lists:
        samples = redistribute_codes(code_list, snac_model)
        my_samples.append(samples)
    return my_samples


def redistribute_codes(code_list, snac_model):
    """Inverse of tokenize_audio(): split the flat 7-per-frame stream back
    into the three SNAC codebook layers and decode to audio."""
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range((len(code_list) + 1) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat


def to_wav_from(samples: list) -> list[np.ndarray]:
    """Converts a list of PyTorch tensors (or NumPy arrays) to NumPy arrays."""
    processed_samples = []
    for s in samples:
        if isinstance(s, torch.Tensor):
            s = s.detach().squeeze().to("cpu").numpy()
        else:
            s = np.squeeze(s)
        processed_samples.append(s)
    return processed_samples


def zero_shot_tts(fpath_audio_ref, audio_ref_transcript, texts: list[str],
                  model, snac_model, tokenizer):
    """End-to-end pipeline: texts -> token ids -> LM sampling -> waveforms."""
    print(f"fpath_audio_ref {fpath_audio_ref}")
    print(f"audio_ref_transcript {audio_ref_transcript}")
    print(f"texts {texts}")
    inp_ids, attn_mask = prepare_inputs(
        fpath_audio_ref, audio_ref_transcript, texts, snac_model, tokenizer
    )
    print(f"input_id_len:{len(inp_ids)}")
    gen_ids = inference(model, inp_ids, attn_mask)
    samples = convert_tokens_to_speech(gen_ids, snac_model)
    wavforms = to_wav_from(samples)
    return wavforms


def save_wav(samples: list[np.ndarray], sample_rate: int, filenames: list[str]):
    """Saves a list of tensors as .wav files.

    Args:
        samples: List of audio tensors (or arrays).
        sample_rate: Sample rate in Hz.
        filenames: List of filenames to save.
    """
    wav_data = to_wav_from(samples)
    for data, filename in zip(wav_data, filenames):
        write(filename, sample_rate, data.astype(np.float32))
        print(f"saved to {filename}")


def get_ref_audio_and_transcript(root_folder: str):
    """Scan root_folder's immediate sub-folders for one (.wav, .txt) pair each
    and return a list of (audio_path, transcript_text) tuples."""
    root_path = Path(root_folder)
    print(f"root_path {root_path}")
    out = []
    for speaker_folder in root_path.iterdir():
        if speaker_folder.is_dir():  # Ensure it's a directory
            wav_files = list(speaker_folder.glob("*.wav"))
            txt_files = list(speaker_folder.glob("*.txt"))
            if wav_files and txt_files:
                ref_audio = wav_files[0]  # Assume only one .wav file per folder
                transcript = txt_files[0].read_text(encoding="utf-8").strip()
                out.append((ref_audio, transcript))
    return out


@app.route("/generate", methods=["POST"])
def generate():
    """POST /generate — body: {"text": "..."}; synthesizes and saves wavs."""
    content = request.json
    process_data(content)
    r_response = {
        "received": content,
        "status": "success",
    }
    response = jsonify(r_response)
    response.headers["Content-Type"] = "application/json; charset=utf-8"
    return response


def process_data(json_text):
    """Run zero-shot TTS for the request text against every reference pair
    and write the results next to each reference under an `inference/` dir."""
    texts = [f"{json_text['text']}"]
    for fpath_audio, audio_transcript in prompt_pairs:
        print(f"zero shot: {fpath_audio} {audio_transcript}")
        wavforms = zero_shot_tts(
            fpath_audio, audio_transcript, texts, model, snac_model, tokenizer
        )
        out_dir = Path(fpath_audio).parent / "inference"
        out_dir.mkdir(parents=True, exist_ok=True)
        timestamp_str = str(int(datetime.now().timestamp()))
        # NOTE(review): separators in the filename pattern were lost in the
        # paste; underscores restored between stem, index and timestamp.
        filenames = [
            f"{out_dir.as_posix()}/{Path(fpath_audio).stem}_{i}_{timestamp_str}.wav"
            for i, t in enumerate(texts)
        ]
        save_wav(wavforms, 24000, filenames)


if __name__ == "__main__":
    tokenizer = load_orpheus_tokenizer()
    model = load_orpheus_auto_model()
    snac_model = load_snac()
    prompt_pairs = get_ref_audio_and_transcript("D:\\AIAPPS\\Orpheus-TTS\\data")
    print("snac_model loaded")
    app.run(debug=True, port=5400)
TR-Llama-8B-Cosmos-Trendyol_DARE_v1
TR-Llama-8B-Cosmos-Trendyol_TIES_v1
qwen2_5-turkish-vlm
Bu model, Qwen2.5-VL-7B üzerine Turkish-VLM-Mix Benchmark ile ince ayarlanmış LoRA adaptörü gömülerek tam ağırlık hâline getirilmiştir.