Description
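The snippet below relies on an AutomaticSpeechRecognitionPipelineFactory that is not part of this excerpt. A minimal sketch of what such a factory could look like, assuming the pipeline() API from @xenova/transformers (the lazy-singleton shape and field names here are assumptions for illustration, not the project's confirmed implementation):

// Sketch (assumed, not taken from this excerpt) of the factory that
// transcribe() relies on. It lazily creates a single Transformers.js
// pipeline and caches it as a static singleton.
import { pipeline } from "@xenova/transformers";

class AutomaticSpeechRecognitionPipelineFactory {
    static task = "automatic-speech-recognition";
    static model = null;     // e.g. "Xenova/whisper-tiny.en"
    static quantized = null; // whether to load quantized weights
    static instance = null;  // cached pipeline (stored as a Promise)

    static async getInstance(progress_callback = null) {
        if (this.instance === null) {
            // pipeline(task, model, options) is the standard Transformers.js entry point.
            this.instance = pipeline(this.task, this.model, {
                quantized: this.quantized,
                progress_callback,
            });
        }
        return this.instance;
    }
}

With a factory of that shape, the `p.model`/`p.quantized` checks and the `(await p.getInstance()).dispose()` call in the function below follow the usual cached-pipeline pattern. The function under discussion: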
const transcribe = async (
    audio,
    model,
    multilingual,
    quantized,
    subtask,
    language,
) => {
    // Build the model name: multilingual checkpoints keep the base name,
    // while English-only checkpoints carry a ".en" suffix
    // (e.g. "Xenova/whisper-tiny" vs "Xenova/whisper-tiny.en").
    const modelName = `Xenova/whisper-${model}${multilingual ? "" : ".en"}`;
    console.log("modelName", modelName);
    const p = AutomaticSpeechRecognitionPipelineFactory;

    // Check if the current model settings are different from the new settings
    if (p.model !== modelName || p.quantized !== quantized) {
        // Invalidate the model if different:
        // update the model name and quantized status
        p.model = modelName;
        p.quantized = quantized;

        // Check if there is an existing instance of the ASR pipeline
        if (p.instance !== null) {
            // Dispose of the existing instance (clean up resources)
            (await p.getInstance()).dispose();
            // Set the instance to null (indicating that it needs to be recreated)
            p.instance = null;
        }
    }
    // Load the transcriber model, forwarding loading progress to the main thread
    let transcriber = await p.getInstance((data) => {
        self.postMessage(data);
    });

    // Time (in seconds) represented by each timestamp token,
    // e.g. a 30 s chunk length / 1500 source positions = 0.02 s per token.
    const time_precision =
        transcriber.processor.feature_extractor.config.chunk_length /
        transcriber.model.config.max_source_positions;
    // Storage for chunks to be processed. Initialise with an empty chunk.
    let chunks_to_process = [
        {
            tokens: [],
            finalised: false,
        },
    ];

    // TODO: Storage for fully-processed and merged chunks
    // let decoded_chunks = [];
    // Called after each audio chunk has been fully processed
    function chunk_callback(chunk) {
        let last = chunks_to_process[chunks_to_process.length - 1];

        // Overwrite the last chunk with the new info and mark it as finalised
        Object.assign(last, chunk);
        last.finalised = true;

        // Create an empty chunk after it, if it is not the last chunk
        if (!chunk.is_last) {
            chunks_to_process.push({
                tokens: [],
                finalised: false,
            });
        }
    }
    // Custom callback invoked after each generation step, used to merge chunks
    function callback_function(item) {
        let last = chunks_to_process[chunks_to_process.length - 1];

        // Update the tokens of the last chunk
        last.tokens = [...item[0].output_token_ids];

        // Merge the text chunks
        // TODO: optimise so we don't have to decode all chunks every time
        let data = transcriber.tokenizer._decode_asr(chunks_to_process, {
            time_precision: time_precision,
            return_timestamps: true,
            force_full_sequences: false,
        });

        // Post a partial transcription update to the main thread
        self.postMessage({
            status: "update",
            task: "automatic-speech-recognition",
            data: data,
        });
    }
    // Actually run transcription
    let output = await transcriber(audio, {
        // Greedy
        top_k: 0,
        do_sample: false,

        // Sliding window
        chunk_length_s: 30,
        stride_length_s: 5,

        // Language and task
        language: language,
        task: subtask,

        // Return timestamps
        return_timestamps: true,
        force_full_sequences: false,

        // Callback functions
        callback_function: callback_function, // after each generation step
        chunk_callback: chunk_callback, // after each chunk is processed
    }).catch((error) => {
        // Report the error to the main thread and return null
        self.postMessage({
            status: "error",
            task: "automatic-speech-recognition",
            data: error,
        });
        return null;
    });

    return output;
};
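For context, in a Web Worker this function would typically be driven by a message handler along these lines (a sketch only; the message field names and the "complete" status are assumptions, not taken from this excerpt):

// Hypothetical worker entry point; the shape of `event.data` is assumed.
self.addEventListener("message", async (event) => {
    const message = event.data;

    const transcript = await transcribe(
        message.audio,        // e.g. a Float32Array of 16 kHz mono samples
        message.model,        // e.g. "tiny"
        message.multilingual,
        message.quantized,
        message.subtask,      // "transcribe" or "translate"
        message.language,
    );
    if (transcript === null) return; // an error was already posted by transcribe()

    // Post the final result back to the main thread.
    self.postMessage({
        status: "complete",
        task: "automatic-speech-recognition",
        data: transcript,
    });
});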