xenova / whisper-web

ML-powered speech recognition directly in your browser

Home Page: https://hf.co/spaces/Xenova/whisper-web

Vanilla JS example

flatsiedatsie opened this issue

It would be great if there were a minimal vanilla JS example of how to use this in a project. I don't use React, so it's very difficult to extract any understanding from the current npm install output.

Check out the whisper-tiny model card for example usage of the model: https://huggingface.co/Xenova/whisper-tiny.en

You'll need to implement the additional UI features and functionality yourself (e.g., mic input), but there are resources online to assist you! A rough mic-input sketch follows the examples below.

Example: Transcribe English.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url);
// { text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country." }

Example: Transcribe English w/ timestamps.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: true });
// {
//   text: " And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.",
//   chunks: [
//     { timestamp: [0, 8],  text: " And so my fellow Americans ask not what your country can do for you" },
//     { timestamp: [8, 11], text: " ask what you can do for your country." }
//   ]
// }

Example: Transcribe English w/ word-level timestamps.

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';

// Create transcription pipeline
let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
let output = await transcriber(url, { return_timestamps: 'word' });
// {
//   "text": " And so my fellow Americans ask not what your country can do for you ask what you can do for your country.",
//   "chunks": [
//     { "text": " And", "timestamp": [0, 0.78] },
//     { "text": " so", "timestamp": [0.78, 1.06] },
//     { "text": " my", "timestamp": [1.06, 1.46] },
//     ...
//     { "text": " for", "timestamp": [9.72, 9.92] },
//     { "text": " your", "timestamp": [9.92, 10.22] },
//     { "text": " country.", "timestamp": [10.22, 13.5] }
//   ]
// }
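
For mic input specifically, a rough sketch like the following should work from inside an ES module (untested; instead of a URL, the pipeline also accepts a mono Float32Array sampled at 16 kHz, which is what the Web Audio API decoding step produces here):

// npm i @xenova/transformers
import { pipeline } from '@xenova/transformers';

const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');

// Record a few seconds of microphone audio
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const recorder = new MediaRecorder(stream);
const chunks = [];
recorder.ondataavailable = (e) => chunks.push(e.data);
recorder.onstop = async () => {
    // Decode the recording; the AudioContext resamples to 16 kHz during decoding
    const arrayBuffer = await new Blob(chunks).arrayBuffer();
    const audioContext = new AudioContext({ sampleRate: 16000 });
    const decoded = await audioContext.decodeAudioData(arrayBuffer);
    const audio = decoded.getChannelData(0); // mono Float32Array

    const output = await transcriber(audio);
    console.log(output.text);
};
recorder.start();
setTimeout(() => recorder.stop(), 5000); // stop after ~5 seconds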

Thanks!

In the end I extracted the necessary part from the React version.

whisper_worker.js

import { pipeline, env } from './js/transformers.js';
console.log("WHISPER WEB WORKER EXISTS");

env.allowLocalModels = false;

addEventListener('message', async (event) => {
	console.log("WHISPER WEB WORKER: RECEIVED MESSAGE");
	console.log("WHISPER WEB WORKER: event.data: ", event.data);
	
    const message = event.data;
	let task = message.task;

    // Do some work...
    // TODO use message data
	try{
		
	    let transcript = await transcribe(
	        message.task.recorded_audio,
	        message.model,
	        message.multilingual,
	        message.quantized,
	        message.subtask,
	        message.language,
	    );
		console.log("WHISPER WEB WORKER: TRANSCRIPTION RESULT: ", transcript);
	    if (transcript === null){
	    	console.error("WHISPER WEB WORKER: transcription was null");
	    }
	    if (typeof transcript === 'undefined'){
	    	console.error("WHISPER WEB WORKER: transcription was undefined??");
	    }

		delete task.recorded_audio;
		task['transcript'] = transcript;
	    // Send the result back to the main thread
	    self.postMessage({
			task: task,
	        status: "complete",
	        //task: "automatic-speech-recognition",
	        transcript: transcript,
	    });
		
	}catch(e){
		console.error("ERROR: whisper worker: ", e);
	}
    

});



// Define model factories
// Ensures only one model is created of each type

class PipelineFactory {
    static task = null;
    static model = null;
    static quantized = null;
    static instance = null;

    constructor(tokenizer, model, quantized) {
        this.tokenizer = tokenizer;
        this.model = model;
        this.quantized = quantized;
    }

    static async getInstance(progress_callback = null) {
        if (this.instance === null) {
            this.instance = pipeline(this.task, this.model, {
                quantized: this.quantized,
                progress_callback,

                // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
                revision: this.model.includes("/whisper-medium") ? "no_attentions" : "main"
            });
        }

        return this.instance;
    }
}


class AutomaticSpeechRecognitionPipelineFactory extends PipelineFactory {
    static task = "automatic-speech-recognition";
    static model = null;
    static quantized = null;
}





const transcribe = async (
    audio,
    model,
    multilingual,
    quantized,
    subtask,
    language,
) => {
	console.log("in transcribe. audio: ", audio);
	console.log("whisper web worker: in transcribe.  model,multilingual,quantized,subtask,language: ", model, multilingual, quantized, subtask, language);
    

	let output = null;

	try{
		const isDistilWhisper = model.startsWith("distil-whisper/");
		
	    let modelName = model;
	    if (!isDistilWhisper && !multilingual) {
	        modelName += ".en"
	    }
		
	    const p = AutomaticSpeechRecognitionPipelineFactory;
	    if (p.model !== modelName || p.quantized !== quantized) {
	        // Invalidate model if different
	        p.model = modelName;
	        p.quantized = quantized;

	        if (p.instance !== null) {
	            (await p.getInstance()).dispose();
	            p.instance = null;
	        }
	    }
		
	    // Load transcriber model
	    let transcriber = await p.getInstance((data) => {
			console.log("whisper web worker: posting something back: ", data);
	        self.postMessage(data);
	    });

	    const time_precision =
	        transcriber.processor.feature_extractor.config.chunk_length /
	        transcriber.model.config.max_source_positions;
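	    // e.g. 30 s chunk_length / 1500 max_source_positions = 0.02 s per timestamp position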

	    // Storage for chunks to be processed. Initialise with an empty chunk.
	    let chunks_to_process = [
	        {
	            tokens: [],
	            finalised: false,
	        },
	    ];

	    // TODO: Storage for fully-processed and merged chunks
	    // let decoded_chunks = [];
		
		
		
	    function chunk_callback(chunk) {
			console.log("in whisper chunk callback. chunk: ", chunk);
	        let last = chunks_to_process[chunks_to_process.length - 1];

	        // Overwrite last chunk with new info
	        Object.assign(last, chunk);
	        last.finalised = true;

	        // Create an empty chunk after it, if it is not the last chunk
	        if (!chunk.is_last) {
	            chunks_to_process.push({
	                tokens: [],
	                finalised: false,
	            });
	        }
	    }
		
		
		
	    // Inject custom callback function to handle merging of chunks
	    function callback_function(item) {
			//console.log("whisper_worker: COMPLETE?  item: ", item);
	        let last = chunks_to_process[chunks_to_process.length - 1];

	        // Update tokens of last chunk
	        last.tokens = [...item[0].output_token_ids];

	        // Merge text chunks
	        // TODO optimise so we don't have to decode all chunks every time
	        let data = transcriber.tokenizer._decode_asr(chunks_to_process, {
	            time_precision: time_precision,
	            return_timestamps: true,
	            force_full_sequences: false,
	        });

	        self.postMessage({
	            status: "update",
	            //task: "automatic-speech-recognition",
	            data: data,
	        });
	    }
		
	    // Actually run transcription
	    output = await transcriber(audio, {
			
	        // Greedy
	        top_k: 0,
	        do_sample: false,

	        // Sliding window
	        chunk_length_s: isDistilWhisper ? 20 : 30,
	        stride_length_s: isDistilWhisper ? 3 : 5,

	        // Language and task
	        language: language,
	        task: subtask,

	        // Return timestamps
	        return_timestamps: true,
	        force_full_sequences: false,

	        // Callback functions
	        callback_function: callback_function, // after each generation step
	        chunk_callback: chunk_callback, // after each chunk is processed
	    })

		.catch((error) => {
			console.error("ERROR, actually running whisper failed");

	        return null;
	    });
		
		console.log("beyond WHISPER transcribe. output: ", output);
		
	}
	catch(e){
		console.error("Whisper worker: error in transcribe function: ", e);
	}
	

    return output;
};


Which I start with this code in a JS module:


window.whisper_worker = null;
window.whisper_worker_busy = false;
let whisper_worker_error_count = 0;







//
//   WHISPER
//

function create_whisper_worker(){
	console.log("in create_whisper_worker");
	
	window.whisper_worker = null;
	window.whisper_worker = new Worker('./whisper_worker.js', {
	  type: 'module'
	});

	console.log("whisper_module: window.whisper_worker: ", window.whisper_worker);
	
	window.whisper_worker.addEventListener('message', e => {
		//console.log("whisper_module: received message from whisper_worker: ", e.data);

		
		if(typeof e.data.status == 'string'){
			if(e.data.status == 'progress'){
				//console.log("whisper worker sent download percentage: ", e.data.progress);
				let whisper_progress_el = document.getElementById('download-progress-whisper');
				if(whisper_progress_el == null){
					console.error("whisper (down)load progress element is missing");
					add_chat_message("whisper",'download_progress#setting---');
				}
				else{
					//console.log("updating whisper (down)load progress");
					whisper_progress_el.value = e.data.progress / 100;
				}
				
			}
			else if(e.data.status == 'ready'){
				console.log("whisper worker sent ready message");
				window.whisper_worker_busy = false;
				add_chat_message("whisper",get_translation('Voice_recognition_has_loaded'));
				let whisper_progress_el = document.getElementById('download-progress-whisper');
				if(whisper_progress_el){
					whisper_progress_el.classList.add('download-complete-chat-message');
				}
				else{
					console.error("whisper became ready, but cannot find loading progress indicator element");
				}
			}
			else if(e.data.status == 'initiate'){
				console.log("whisper worker sent initiate message");
			}
			else if(e.data.status == 'download'){
				console.log("whisper worker sent download message");
				add_chat_message("whisper","(down)loading: " + e.data.file);
			}
			
			else if(e.data.status == 'update'){
				if(typeof e.data.data == 'object' && e.data.data != null && e.data.data.length){
					set_chat_status(e.data.data[0],2);
				}
				
			}
			
			else if(e.data.status == 'complete'){
				window.whisper_worker_busy = false;
				console.log('GOT WHISPER COMPLETE.  e.data: ', e.data);
				console.log('GOT WHISPER COMPLETE.  e.data.transcript: ', e.data.transcript);
				console.log('GOT WHISPER COMPLETE.  e.data.task: ', e.data.task);
				
				if(e.data.transcript == null){
					console.warn("whisper recognition failed. If this is the first run, that's normal.");
					set_state(LISTENING);
				}
				else if(typeof e.data.transcript != 'undefined'){
					console.log("whisper returned transcription text: ", e.data.transcript);
					
					if(Array.isArray(e.data.transcript)){
						console.log("typeof transcription is array");
					}
					else if(typeof e.data.transcript == 'object'){
						if(typeof e.data.transcript.text == 'string'){
							console.log("GOT TEXT: ", e.data.transcript.text);
						}
					}
				}
				else{
					console.log("transcript was not in whisper e.data");
				}
				
				//add_chat_message("whisper","(down)loading: " + e.data.file);
			}
			else{
				console.log("whisper worker sent a content message");
				window.whisper_worker_busy = false;
				
				if(e.data.data == null){
					console.warn("whisper recognition failed. If this is the first run, that's normal.");
					set_state(LISTENING);
				}
			}
		}
			
			if(window.enable_microphone == false){
				console.log("whisper worker returned audio file, but in the meantime enable_microphone was disabled. Throwing away the data.");
			}
			else{
				
				/*
			
				if(window.whisper_queue.length){
					console.log("whisper worker done, but there is more work to do. Sentences still in whisper_queue: ", window.whisper_queue.length);
					let next_sentence = window.whisper_queue[0][0] + window.whisper_queue[0][1]; // sentence plus punctuation mark
					window.whisper_queue.splice(0,1);
				
				
					whisper_worker.postMessage({'whisper_counter':window.whisper_counter,'sentence':next_sentence});
					window.whisper_counter++;
				}
				else{
					console.log("whisper worker was done, and there are no more sentences in the whisper queue. Worker is now idle.");
					window.whisper_worker_busy = false;
				}
				*/
			}
	
	});


	window.whisper_worker.addEventListener('error', (error) => {
		console.error("ERROR: whisper_worker sent error. terminating!. Error was: ", error, error.message);
		whisper_worker_error_count++;
		
		window.whisper_worker.terminate();
		window.whisper_worker_busy = false;
		if(typeof error != 'undefined' && whisper_worker_error_count < 10){
			setTimeout(() => {
				console.log("attempting to restart whisper worker");
				create_whisper_worker();
			},1000);
		}
		else{
			console.error("whisper_worker errored out");
		}
	});
}

// create whisper worker
create_whisper_worker();


//
//  Send audio buffer to whisper worker
//
function do_whisper_web(task,language=null){
	console.log("in do_whisper_web. task: ", task);
	
	if(window.whisper_worker_busy){
		console.error("do_whisper_web was called while whisper worker was busy. Aborting.");
		return
	}
	
	if(typeof task.recorded_audio == 'undefined'){
		console.error("do_whisper_web: task did not contain recorded_audio. Aborting.");
		return
	}
	
	task.state = 'stt_in_progress';
	
	let multilingual = false;
	if(typeof language == 'string'){
		if(language != 'en'){
			multilingual = true;
		}
	}
	const quantized = false;
	const model = "Xenova/whisper-tiny";
	
	const subtask = null;
	
	console.log("do_whisper_web: sending audio to whisper worker: ", task.recorded_audio);
	
    window.whisper_worker.postMessage({
        task:task,
        model,
        multilingual,
        quantized,
        subtask: multilingual ? subtask : null,
        language:
            multilingual && language !== "auto" ? language : null,
    });

}
window.do_whisper_web = do_whisper_web;
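
For reference, the worker hands task.recorded_audio straight to the transcriber, so it should be a mono Float32Array sampled at 16 kHz. A simplified sketch of how to produce one from a MediaRecorder blob (blob_to_whisper_audio is just an illustrative helper, not part of the code above):

async function blob_to_whisper_audio(blob){
	const array_buffer = await blob.arrayBuffer();
	const audio_context = new AudioContext({ sampleRate: 16000 }); // resamples during decoding
	const decoded = await audio_context.decodeAudioData(array_buffer);
	return decoded.getChannelData(0); // mono Float32Array
}

// e.g. once MediaRecorder has produced a blob:
// const recorded_audio = await blob_to_whisper_audio(recorded_blob);
// do_whisper_web({ recorded_audio: recorded_audio }, 'en');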


I had a similar desire to use this in vanilla JavaScript form and had extracted some code from the React version to give it a try, but this version is much nicer. Thanks for posting it! A rough version of whisper + vanilla JS is in this repo:
https://github.com/vital-ai/vital-stt-js
I wanted to test this out in combination with a browser-based wake word detector, which I posted here:
https://github.com/chat-ai-app/chat-ai-assistant-demo
with the web app posted here:
https://demo-voice.chat.ai/