diff --git a/packages/components/credentials/AzureCognitiveServices.credential.ts b/packages/components/credentials/AzureCognitiveServices.credential.ts
new file mode 100644
index 00000000000..66323e088a9
--- /dev/null
+++ b/packages/components/credentials/AzureCognitiveServices.credential.ts
@@ -0,0 +1,43 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+/**
+ * Credential definition for Azure Cognitive Services (Azure AI Speech).
+ * Consumed by the Azure speech-to-text provider in speechToText.ts.
+ */
+class AzureCognitiveServices implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Azure Cognitive Services'
+        this.name = 'azureCognitiveServices'
+        this.version = 1.0
+        this.inputs = [
+            {
+                label: 'Azure Subscription Key',
+                name: 'azureSubscriptionKey',
+                type: 'password',
+                description: 'Your Azure Cognitive Services subscription key'
+            },
+            {
+                label: 'Service Region',
+                name: 'serviceRegion',
+                type: 'string',
+                description: 'The Azure service region (e.g., "westus", "eastus")',
+                placeholder: 'westus'
+            },
+            {
+                label: 'API Version',
+                name: 'apiVersion',
+                type: 'string',
+                description: 'The API version to use (e.g., "2024-05-15-preview")',
+                placeholder: '2024-05-15-preview',
+                default: '2024-05-15-preview'
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: AzureCognitiveServices }
diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index 547804c5c3f..fbb659d54e3 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -3,12 +3,14 @@
 import { getCredentialData } from './utils'
 import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
+import axios from 'axios'
 import Groq from 'groq-sdk'
 
 const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
     LOCALAI_STT: 'localAISTT',
+    AZURE_COGNITIVE: 'azureCognitive',
     GROQ_WHISPER: 'groqWhisper'
 }
@@ -72,6 +74,43 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi
             }
             break
         }
+        case SpeechToTextType.AZURE_COGNITIVE: {
+            try {
+                // Azure AI Speech fast transcription (synchronous REST) endpoint; region comes from the stored credential
+                const baseUrl = `https://${credentialData.serviceRegion}.cognitiveservices.azure.com/speechtotext/transcriptions:transcribe`
+                const apiVersion = credentialData.apiVersion || '2024-05-15-preview'
+
+                const formData = new FormData()
+                const audioBlob = new Blob([audio_file], { type: upload.type })
+                formData.append('audio', audioBlob, upload.name)
+
+                const channelsStr = speechToTextConfig.channels || '0,1'
+                const channels = channelsStr.split(',').map(Number)
+
+                // Request definition: locale, profanity handling and audio channels, sent as a JSON form field
+                const definition = {
+                    locales: [speechToTextConfig.language || 'en-US'],
+                    profanityFilterMode: speechToTextConfig.profanityFilterMode || 'Masked',
+                    channels
+                }
+                formData.append('definition', JSON.stringify(definition))
+
+                const response = await axios.post(`${baseUrl}?api-version=${apiVersion}`, formData, {
+                    headers: {
+                        'Ocp-Apim-Subscription-Key': credentialData.azureSubscriptionKey,
+                        Accept: 'application/json'
+                    }
+                })
+
+                if (response.data?.combinedPhrases?.length) {
+                    return response.data.combinedPhrases[0]?.text || ''
+                }
+                return ''
+            } catch (error) {
+                // Surface the Azure error payload when available, otherwise rethrow the original error
+                throw axios.isAxiosError(error) && error.response?.data ? error.response.data : error
+            }
+        }
         case SpeechToTextType.GROQ_WHISPER: {
             const groqClient = new Groq({
                 apiKey: credentialData.groqApiKey
diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
index 17dc132039b..145f7baa8e5 100644
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
 import localAiPng from '@/assets/images/localai.png'
+import azureSvg from '@/assets/images/azure_openai.svg'
 import groqPng from '@/assets/images/groq.png'
 
 // store
@@ -31,6 +32,7 @@ const SpeechToTextType = {
     OPENAI_WHISPER: 'openAIWhisper',
     ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
     LOCALAI_STT: 'localAISTT',
+    AZURE_COGNITIVE: 'azureCognitive',
     GROQ_WHISPER: 'groqWhisper'
 }
@@ -142,6 +144,59 @@ const speechToTextProviders = {
         }
     ]
 },
+    // Azure Cognitive Services (Azure AI Speech) fast transcription provider
+    [SpeechToTextType.AZURE_COGNITIVE]: {
+        label: 'Azure Cognitive Services',
+        name: SpeechToTextType.AZURE_COGNITIVE,
+        icon: azureSvg, // reuses the shared Azure logo asset
+        url: 'https://azure.microsoft.com/en-us/products/cognitive-services/speech-services',
+        inputs: [
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['azureCognitiveServices']
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description: 'The recognition language (e.g., "en-US", "es-ES")',
+                placeholder: 'en-US',
+                optional: true
+            },
+            {
+                label: 'Profanity Filter Mode',
+                name: 'profanityFilterMode',
+                type: 'options',
+                description: 'How to handle profanity in the transcription',
+                options: [
+                    {
+                        label: 'None',
+                        name: 'None'
+                    },
+                    {
+                        label: 'Masked',
+                        name: 'Masked'
+                    },
+                    {
+                        label: 'Removed',
+                        name: 'Removed'
+                    }
+                ],
+                default: 'Masked',
+                optional: true
+            },
+            {
+                label: 'Audio Channels',
+                name: 'channels',
+                type: 'string',
+                description: 'Comma-separated list of audio channels to process (e.g., "0,1")',
+                placeholder: '0,1',
+                default: '0,1'
+            }
+        ]
+    },
     [SpeechToTextType.GROQ_WHISPER]: {
         label: 'Groq Whisper',
         name: SpeechToTextType.GROQ_WHISPER,