Vanilla JS example
flatsiedatsie opened this issue
It would be great if there were a minimal vanilla JS example of how to use this in a project. I don't use React, so the current React-based examples are very difficult to extract any understanding from.
Check out the whisper-tiny model card for example usage of the model: https://huggingface.co/Xenova/whisper-tiny.en
You'll need to implement the additional UI features and functionality (e.g., mic input) yourself, but there are plenty of resources online to assist you!
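For instance, here is a minimal sketch (not from the official docs) of capturing mic audio in the browser and feeding the raw samples to the pipeline. `transcriber` is assumed to be the pipeline created in the examples below, and the 5-second recording window is arbitrary:

// Record ~5 seconds of microphone audio (assumes a secure context and mic permission)
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const recorder = new MediaRecorder(stream);
const blobs = [];
recorder.ondataavailable = (e) => blobs.push(e.data);
recorder.start();
await new Promise((resolve) => setTimeout(resolve, 5000));
recorder.stop();
await new Promise((resolve) => (recorder.onstop = resolve));

// Whisper expects 16 kHz mono audio; decodeAudioData resamples to the context's rate
const audioCtx = new AudioContext({ sampleRate: 16000 });
const arrayBuffer = await new Blob(blobs).arrayBuffer();
const decoded = await audioCtx.decodeAudioData(arrayBuffer);
const audio = decoded.getChannelData(0); // Float32Array of raw samples

// The pipeline accepts raw Float32Array audio as well as URLs
let output = await transcriber(audio);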
Example: Transcribe English.
// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';
let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url);
// { text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country." }
Example: Transcribe English w/ timestamps.
// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';
let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: true });
// {
//   text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.",
//   chunks: [
//     { timestamp: [0, 8], text: " And so my fellow Americans ask not what your country can do for you" },
//     { timestamp: [8, 11], text: " ask what you can do for your country." }
//   ]
// }
Example: Transcribe English w/ word-level timestamps.
// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';
let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: 'word' });
// {
//   "text": " And so my fellow Americans ask not what your country can do for you ask what you can do for your country.",
//   "chunks": [
//     { "text": " And", "timestamp": [0, 0.78] },
//     { "text": " so", "timestamp": [0.78, 1.06] },
//     { "text": " my", "timestamp": [1.06, 1.46] },
//     ...
//     { "text": " for", "timestamp": [9.72, 9.92] },
//     { "text": " your", "timestamp": [9.92, 10.22] },
//     { "text": " country.", "timestamp": [10.22, 13.5] }
//   ]
// }
Thanks!
In the end I extracted the necessary part from the React version.
whisper_worker.js
import { pipeline, env } from './js/transformers.js';

console.log("WHISPER WEB WORKER EXISTS");

env.allowLocalModels = false;

addEventListener('message', async (event) => {
    console.log("WHISPER WEB WORKER: RECEIVED MESSAGE");
    console.log("WHISPER WEB WORKER: event.data: ", event.data);
    const message = event.data;
    let task = message.task;

    // TODO use message data
    try {
        let transcript = await transcribe(
            message.task.recorded_audio,
            message.model,
            message.multilingual,
            message.quantized,
            message.subtask,
            message.language,
        );
        console.log("WHISPER WEB WORKER: TRANSCRIPTION RESULT: ", transcript);
        if (transcript === null) {
            console.error("WHISPER WEB WORKER: transcription was null");
        }
        if (typeof transcript === 'undefined') {
            console.error("WHISPER WEB WORKER: transcription was undefined");
        }

        // Remove the (potentially large) audio buffer before echoing the task back
        delete task.recorded_audio;
        task['transcript'] = transcript;

        // Send the result back to the main thread
        self.postMessage({
            task: task,
            status: "complete",
            //task: "automatic-speech-recognition",
            transcript: transcript,
        });
    } catch (e) {
        console.error("ERROR: whisper worker: ", e);
    }
});
// Define model factories
// Ensures only one model is created of each type
class PipelineFactory {
    static task = null;
    static model = null;
    static quantized = null;
    static instance = null;

    constructor(tokenizer, model, quantized) {
        this.tokenizer = tokenizer;
        this.model = model;
        this.quantized = quantized;
    }

    static async getInstance(progress_callback = null) {
        if (this.instance === null) {
            this.instance = pipeline(this.task, this.model, {
                quantized: this.quantized,
                progress_callback,
                // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
                revision: this.model.includes("/whisper-medium") ? "no_attentions" : "main",
            });
        }
        return this.instance;
    }
}

class AutomaticSpeechRecognitionPipelineFactory extends PipelineFactory {
    static task = "automatic-speech-recognition";
    static model = null;
    static quantized = null;
}
const transcribe = async (
    audio,
    model,
    multilingual,
    quantized,
    subtask,
    language,
) => {
    console.log("in transcribe. audio: ", audio);
    console.log("whisper web worker: in transcribe. model,multilingual,quantized,subtask,language: ", model, multilingual, quantized, subtask, language);

    let output = null;
    try {
        const isDistilWhisper = model.startsWith("distil-whisper/");

        let modelName = model;
        if (!isDistilWhisper && !multilingual) {
            modelName += ".en";
        }

        const p = AutomaticSpeechRecognitionPipelineFactory;
        if (p.model !== modelName || p.quantized !== quantized) {
            // Invalidate model if different
            p.model = modelName;
            p.quantized = quantized;
            if (p.instance !== null) {
                (await p.getInstance()).dispose();
                p.instance = null;
            }
        }

        // Load transcriber model
        let transcriber = await p.getInstance((data) => {
            console.log("whisper web worker: posting something back: ", data);
            self.postMessage(data);
        });

        const time_precision =
            transcriber.processor.feature_extractor.config.chunk_length /
            transcriber.model.config.max_source_positions;

        // Storage for chunks to be processed. Initialise with an empty chunk.
        let chunks_to_process = [
            {
                tokens: [],
                finalised: false,
            },
        ];

        // TODO: Storage for fully-processed and merged chunks
        // let decoded_chunks = [];

        function chunk_callback(chunk) {
            console.log("in whisper chunk callback. chunk: ", chunk);
            let last = chunks_to_process[chunks_to_process.length - 1];

            // Overwrite last chunk with new info
            Object.assign(last, chunk);
            last.finalised = true;

            // Create an empty chunk after, if it is not the last chunk
            if (!chunk.is_last) {
                chunks_to_process.push({
                    tokens: [],
                    finalised: false,
                });
            }
        }

        // Inject custom callback function to handle merging of chunks
        function callback_function(item) {
            //console.log("whisper_worker: COMPLETE? item: ", item);
            let last = chunks_to_process[chunks_to_process.length - 1];

            // Update tokens of last chunk
            last.tokens = [...item[0].output_token_ids];

            // Merge text chunks
            // TODO optimise so we don't have to decode all chunks every time
            let data = transcriber.tokenizer._decode_asr(chunks_to_process, {
                time_precision: time_precision,
                return_timestamps: true,
                force_full_sequences: false,
            });

            self.postMessage({
                status: "update",
                //task: "automatic-speech-recognition",
                data: data,
            });
        }

        // Actually run transcription
        output = await transcriber(audio, {
            // Greedy
            top_k: 0,
            do_sample: false,

            // Sliding window
            chunk_length_s: isDistilWhisper ? 20 : 30,
            stride_length_s: isDistilWhisper ? 3 : 5,

            // Language and task
            language: language,
            task: subtask,

            // Return timestamps
            return_timestamps: true,
            force_full_sequences: false,

            // Callback functions
            callback_function: callback_function, // after each generation step
            chunk_callback: chunk_callback, // after each chunk is processed
        }).catch((error) => {
            console.error("ERROR, actually running whisper failed: ", error);
            return null;
        });

        console.log("beyond WHISPER transcribe. output: ", output);
    }
    catch (e) {
        console.error("Whisper worker: error in transcribe function: ", e);
    }
    return output;
};
Which I start with this code in a JS module:
window.whisper_worker = null;
window.whisper_worker_busy = false;
let whisper_worker_error_count = 0;

//
// WHISPER
//

function create_whisper_worker() {
    console.log("in create_whisper_worker");
    window.whisper_worker = null;
    window.whisper_worker = new Worker('./whisper_worker.js', {
        type: 'module'
    });
    console.log("whisper_module: window.whisper_worker: ", window.whisper_worker);

    window.whisper_worker.addEventListener('message', e => {
        //console.log("whisper_module: received message from whisper_worker: ", e.data);

        if (typeof e.data.status == 'string') {
            if (e.data.status == 'progress') {
                //console.log("whisper worker sent download percentage: ", e.data.progress);
                let whisper_progress_el = document.getElementById('download-progress-whisper');
                if (whisper_progress_el == null) {
                    console.error("whisper (down)load progress element is missing");
                    add_chat_message("whisper", 'download_progress#setting---');
                }
                else {
                    //console.log("updating whisper (down)load progress");
                    whisper_progress_el.value = e.data.progress / 100;
                }
            }
            else if (e.data.status == 'ready') {
                console.log("whisper worker sent ready message");
                window.whisper_worker_busy = false;
                add_chat_message("whisper", get_translation('Voice_recognition_has_loaded'));
                let whisper_progress_el = document.getElementById('download-progress-whisper');
                if (whisper_progress_el) {
                    whisper_progress_el.classList.add('download-complete-chat-message');
                }
                else {
                    console.error("whisper became ready, but cannot find loading progress indicator element");
                }
            }
            else if (e.data.status == 'initiate') {
                console.log("whisper worker sent initiate message");
            }
            else if (e.data.status == 'download') {
                console.log("whisper worker sent download message");
                add_chat_message("whisper", "(down)loading: " + e.data.file);
            }
            else if (e.data.status == 'update') {
                if (typeof e.data.data == 'object' && e.data.data != null && e.data.data.length) {
                    set_chat_status(e.data.data[0], 2);
                }
            }
            else if (e.data.status == 'complete') {
                window.whisper_worker_busy = false;
                console.log('GOT WHISPER COMPLETE. e.data: ', e.data);
                console.log('GOT WHISPER COMPLETE. e.data.transcript: ', e.data.transcript);
                console.log('GOT WHISPER COMPLETE. e.data.task: ', e.data.task);
                if (e.data.transcript == null) {
                    console.warn("whisper recognition failed. If this is the first run, that's normal.");
                    set_state(LISTENING);
                }
                else if (typeof e.data.transcript != 'undefined') {
                    console.log("whisper returned transcription text: ", e.data.transcript);
                    if (Array.isArray(e.data.transcript)) {
                        console.log("typeof transcription is array");
                    }
                    else if (typeof e.data.transcript == 'object') {
                        if (typeof e.data.transcript.text == 'string') {
                            console.log("GOT TEXT: ", e.data.transcript.text);
                        }
                    }
                }
                else {
                    console.log("transcript was not in whisper e.data");
                }
                //add_chat_message("whisper","(down)loading: " + e.data.file);
            }
            else {
                console.log("whisper worker sent a content message");
                window.whisper_worker_busy = false;
                if (e.data.data == null) {
                    console.warn("whisper recognition failed. If this is the first run, that's normal.");
                    set_state(LISTENING);
                }
            }
        }

        if (window.enable_microphone == false) {
            console.log("whisper worker returned audio file, but in the meantime enable_microphone was disabled. Throwing away the data.");
        }
        else {
            /*
            if (window.whisper_queue.length) {
                console.log("whisper worker done, but there is more work to do. Sentences still in whisper_queue: ", window.whisper_queue.length);
                let next_sentence = window.whisper_queue[0][0] + window.whisper_queue[0][1]; // sentence plus punctuation mark
                window.whisper_queue.splice(0, 1);
                whisper_worker.postMessage({ 'whisper_counter': window.whisper_counter, 'sentence': next_sentence });
                window.whisper_counter++;
            }
            else {
                console.log("whisper worker was done, and there are no more sentences in the whisper queue. Worker is now idle.");
                window.whisper_worker_busy = false;
            }
            */
        }
    });
    window.whisper_worker.addEventListener('error', (error) => {
        console.error("ERROR: whisper_worker sent an error. Terminating! Error was: ", error, error.message);
        whisper_worker_error_count++;
        window.whisper_worker.terminate();
        window.whisper_worker_busy = false;
        if (typeof error != 'undefined' && whisper_worker_error_count < 10) {
            setTimeout(() => {
                console.log("attempting to restart whisper worker");
                create_whisper_worker();
            }, 1000);
        }
        else {
            console.error("whisper_worker errored out");
        }
    });
}
// create whisper worker
create_whisper_worker();

//
// Send audio buffer to whisper worker
//

function do_whisper_web(task, language = null) {
    console.log("in do_whisper_web. task: ", task);
    if (window.whisper_worker_busy) {
        console.error("do_whisper_web was called while whisper worker was busy. Aborting.");
        return;
    }
    if (typeof task.recorded_audio == 'undefined') {
        console.error("do_whisper_web: task did not contain recorded_audio. Aborting.");
        return;
    }
    task.state = 'stt_in_progress';

    let multilingual = false;
    if (typeof language == 'string') {
        if (language != 'en') {
            multilingual = true;
        }
    }

    const quantized = false;
    const model = "Xenova/whisper-tiny";
    const subtask = null;

    console.log("do_whisper_web: sending audio to whisper worker: ", task.recorded_audio);
    window.whisper_worker.postMessage({
        task: task,
        model,
        multilingual,
        quantized,
        subtask: multilingual ? subtask : null,
        language: multilingual && language !== "auto" ? language : null,
    });
}
window.do_whisper_web = do_whisper_web;
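For completeness, a hypothetical call into this module could look like the following; recorded_samples is assumed to be a Float32Array of 16 kHz mono samples (e.g. decoded with an AudioContext as in the mic sketch above), and the task fields shown are illustrative:

// Hypothetical usage of do_whisper_web
const task = {
    state: 'recorded',                // illustrative initial state
    recorded_audio: recorded_samples, // assumed: Float32Array of 16 kHz mono audio
};
window.do_whisper_web(task, 'en');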
I had a similar desire to use this in vanilla JavaScript form and had extracted some code from the React version to give it a try, but this version is much nicer. Thanks for posting it! A rough version of Whisper + vanilla JS is in this repo:
https://github.com/vital-ai/vital-stt-js
I wanted to test this out in combination with a browser-based wake word detector, which I posted here:
https://github.com/chat-ai-app/chat-ai-assistant-demo
with the web app posted here:
https://demo-voice.chat.ai/