-
Notifications
You must be signed in to change notification settings - Fork 131
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Addresses #26
- Loading branch information
Showing
1 changed file
with
159 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
/** | ||
* Example provided by https://github.com/Gan-Xing in https://github.com/fedirz/faster-whisper-server/issues/26 | ||
*/ | ||
import fs from 'fs'; | ||
import WebSocket from 'ws'; | ||
import fetch from 'node-fetch'; | ||
import FormData from 'form-data'; | ||
import path from 'path'; | ||
import ffmpeg from 'fluent-ffmpeg'; | ||
import dotenv from 'dotenv'; | ||
|
||
// Load environment variables from .env before any of them are read below.
dotenv.config();

// Allow overriding the ffmpeg binary location via FFMPEG_PATH;
// fall back to the conventional system install path.
ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH || '/usr/bin/ffmpeg');
|
||
/**
 * Transcribe an audio file using the HTTP endpoint.
 * Supported file types include wav, mp3, webm, and other types supported by the OpenAI API.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format (e.g. 'json')
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed JSON transcription response
 * @throws {Error} When the server responds with a non-2xx status
 */
async function transcribeFile(filePath, model, language, responseFormat, temperature) {
  const formData = new FormData();
  formData.append('file', fs.createReadStream(filePath));
  formData.append('model', model);
  formData.append('language', language);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);

  const response = await fetch(`${process.env.TRANSCRIPTION_API_BASE_URL}/v1/audio/transcriptions`, {
    method: 'POST',
    body: formData,
  });

  // Fail loudly on HTTP errors instead of attempting to JSON-parse an error page.
  if (!response.ok) {
    throw new Error(`Transcription request failed: ${response.status} ${response.statusText}`);
  }

  const transcription = await response.json();
  console.log('Transcription Response:', transcription);
  return transcription;
}
|
||
/**
 * Translate an audio file using the HTTP endpoint.
 * Only English is supported as the translation target.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} responseFormat - Response format (e.g. 'json')
 * @param {string} temperature - Temperature setting
 * @returns {Promise<object>} Parsed JSON translation response
 * @throws {Error} When the server responds with a non-2xx status
 */
async function translateFile(filePath, model, responseFormat, temperature) {
  const formData = new FormData();
  formData.append('file', fs.createReadStream(filePath));
  formData.append('model', model);
  formData.append('response_format', responseFormat);
  formData.append('temperature', temperature);

  const response = await fetch(`${process.env.TRANSLATION_API_BASE_URL}/v1/audio/translations`, {
    method: 'POST',
    body: formData,
  });

  // Fail loudly on HTTP errors instead of attempting to JSON-parse an error page.
  if (!response.ok) {
    throw new Error(`Translation request failed: ${response.status} ${response.statusText}`);
  }

  const translation = await response.json();
  console.log('Translation Response:', translation);
  return translation;
}
|
||
/**
 * Send audio data over WebSocket for transcription.
 * The payload tested against this endpoint is raw PCM audio.
 *
 * The server base URL can be set via the TRANSCRIPTION_WS_BASE_URL environment
 * variable; the previously hard-coded host is kept as a fallback for
 * backward compatibility.
 *
 * @param {string} filePath - Path to the audio file
 * @param {string} model - Model name
 * @param {string} language - Language code
 * @param {string} responseFormat - Response format
 * @param {string} temperature - Temperature setting
 * @returns {Promise<void>} Resolves when the socket closes; rejects on socket error
 */
async function sendAudioOverWebSocket(filePath, model, language, responseFormat, temperature) {
  const baseUrl = process.env.TRANSCRIPTION_WS_BASE_URL || 'ws://100.105.162.69:8000';
  const params = new URLSearchParams({
    model,
    language,
    response_format: responseFormat,
    temperature,
  });
  const wsUrl = `${baseUrl}/v1/audio/transcriptions?${params.toString()}`;
  const ws = new WebSocket(wsUrl);

  // Wrap the socket lifecycle in a Promise so awaiting this function actually
  // waits for the connection to finish (the original returned immediately
  // after registering its handlers) and so errors propagate to the caller.
  await new Promise((resolve, reject) => {
    ws.on('open', () => {
      ws.send(fs.readFileSync(filePath));
    });

    ws.on('message', (message) => {
      const response = JSON.parse(message);
      console.log('WebSocket Response:', response);
    });

    ws.on('close', () => {
      console.log('WebSocket connection closed');
      resolve();
    });

    ws.on('error', (error) => {
      console.error('WebSocket error:', error);
      reject(error);
    });
  });
}
|
||
/**
 * Convert an audio file to raw PCM (mono, 16 kHz, signed 16-bit little-endian).
 *
 * @param {string} filePath - Path to the source audio file
 * @returns {Promise<string>} Path to the converted .pcm file
 */
async function convertToPcm(filePath) {
  const sourceExtension = path.extname(filePath);
  const pcmFilePath = filePath.replace(sourceExtension, '.pcm');

  // Resolve with the output path once ffmpeg finishes; reject on conversion failure.
  return new Promise((resolve, reject) => {
    ffmpeg(filePath)
      .audioChannels(1)
      .audioFrequency(16000)
      .audioCodec('pcm_s16le')
      .toFormat('s16le')
      .on('end', () => {
        console.log(`Audio file successfully converted to PCM: ${pcmFilePath}`);
        resolve(pcmFilePath);
      })
      .on('error', (error) => {
        console.error(`Error converting audio to PCM: ${error.message}`);
        reject(error);
      })
      .save(pcmFilePath);
  });
}
|
||
/**
 * Demo driver: convert a source audio file to PCM, then exercise the
 * HTTP transcription, HTTP translation, and WebSocket transcription endpoints.
 * Steps run sequentially because each one consumes the converted file.
 */
async function main() {
  const filePath = './path/to/your/audio.webm'; // Replace with the actual file path
  const model = 'Systran/faster-whisper-large-v3';
  const language = 'en';
  const responseFormat = 'json';
  const temperature = '0';

  // Convert once, then reuse the PCM output for every endpoint.
  const pcmFilePath = await convertToPcm(filePath);

  await transcribeFile(pcmFilePath, model, language, responseFormat, temperature);
  await translateFile(pcmFilePath, model, responseFormat, temperature);
  await sendAudioOverWebSocket(pcmFilePath, model, language, responseFormat, temperature);
}
|
||
// Make sure to use ffmpeg version 7 or above. The default apt-get install only installs version 4.x. Also, Ubuntu 22.04 or above is required to support version 7.x.
main().catch((error) => console.error(error));

// Project URL: https://github.com/Gan-Xing/whisper