Commit: hugging face implementation #128

absadiki committed Apr 3, 2024
1 parent b2f3665 commit ce9834d
Showing 3 changed files with 89 additions and 1 deletion.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ dl_translate==0.3.0
 faster_whisper
 whisperx @ git+https://github.com/m-bain/whisperx.git
 stable-ts
-openai
+openai
+transformers
8 changes: 8 additions & 0 deletions src/subsai/configs.py
@@ -9,6 +9,7 @@
     DEFAULT_APPLY_OFFSET_SECONDS, DEFAULT_FRAME_RATE, DEFAULT_VAD
 
 from subsai.models.faster_whisper_model import FasterWhisperModel
+from subsai.models.hugging_face_model import HuggingFaceModel
 from subsai.models.whisperX_model import WhisperXModel
 from subsai.models.whisper_model import WhisperModel
 from subsai.models.whisper_timestamped_model import WhisperTimeStamped
@@ -69,6 +70,13 @@
         'url': 'https://platform.openai.com/docs/guides/speech-to-text',
         'config_schema': WhisperAPIModel.config_schema,
     },
+    'HuggingFace': {
+        'class': HuggingFaceModel,
+        'description': 'Hugging Face implementation of Whisper. '
+                       'Any pretrained speech recognition model from the Hugging Face Hub can also be used.',
+        'url': 'https://huggingface.co/tasks/automatic-speech-recognition',
+        'config_schema': HuggingFaceModel.config_schema,
+    },
 }
 
 BASIC_TOOLS_CONFIGS = {
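With this registry entry, the new model is selectable by its 'HuggingFace' key. A minimal usage sketch, assuming subsai's existing SubsAI.create_model/transcribe API; the media path is a placeholder:

    from subsai import SubsAI

    subs_ai = SubsAI()
    # 'HuggingFace' is the key registered in MODELS above; config keys mirror
    # HuggingFaceModel.config_schema, and omitted keys fall back to schema defaults.
    model = subs_ai.create_model('HuggingFace', {'model_id': 'openai/whisper-tiny',
                                                 'segment_type': 'sentence'})
    subs = subs_ai.transcribe('./assets/video.mp4', model)  # placeholder path
    subs.save('video.srt')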
79 changes: 79 additions & 0 deletions src/subsai/models/hugging_face_model.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Hugging Face Model
See [automatic-speech-recognition](https://huggingface.co/tasks/automatic-speech-recognition)
"""

import pysubs2
from pysubs2 import SSAFile, SSAEvent
from subsai.models.abstract_model import AbstractModel
from subsai.utils import _load_config, get_available_devices

from transformers import pipeline


devices = get_available_devices()

class HuggingFaceModel(AbstractModel):
    """Hugging Face implementation of Whisper via the `transformers` ASR pipeline."""
    model_name = 'HuggingFaceModel'
config_schema = {
# load model config
'model_id': {
'type': str,
'description': 'The model id from the Hugging Face Hub.',
'options': None,
'default': 'openai/whisper-tiny'
},
'device': {
'type': list,
'description': 'Pytorch device',
'options': devices,
'default': devices[0]
},
'segment_type': {
'type': list,
'description': "Sentence-level or word-level timestamps",
'options': ['sentence', 'word'],
'default': 'sentence'
},
        'chunk_length_s': {
            'type': float,
            'description': 'Chunk length in seconds for chunked long-form transcription. '
                           'If `chunk_length_s = 0`, chunking is disabled (the pipeline default).',
            'options': None,
            'default': 30
        }
}

    def __init__(self, model_config):
        super(HuggingFaceModel, self).__init__(model_config=model_config,
                                               model_name=self.model_name)
        # config
        self._model_id = _load_config('model_id', model_config, self.config_schema)
        self._device = _load_config('device', model_config, self.config_schema)
        self._segment_type = _load_config('segment_type', model_config, self.config_schema)
        self._chunk_length_s = _load_config('chunk_length_s', model_config, self.config_schema)

        self.model = pipeline(
            'automatic-speech-recognition',
            model=self._model_id,
            device=self._device,
        )

    def transcribe(self, media_file):
        results = self.model(
            media_file,
            chunk_length_s=self._chunk_length_s,
            return_timestamps=True if self._segment_type == 'sentence' else 'word',
        )
        subs = SSAFile()
        for chunk in results['chunks']:
            start, end = chunk['timestamp']
            # The pipeline may return None for a final end timestamp;
            # fall back to the chunk's start time in that case.
            if end is None:
                end = start
            event = SSAEvent(start=pysubs2.make_time(s=start),
                             end=pysubs2.make_time(s=end))
            event.plaintext = chunk['text']
            subs.append(event)
        return subs
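For reference, a minimal sketch of driving the new class directly, with a placeholder audio path; it relies on the `transformers` ASR pipeline returning {'text': ..., 'chunks': [{'timestamp': (start, end), 'text': ...}, ...]}, which transcribe() maps to pysubs2 events above:

    from subsai.models.hugging_face_model import HuggingFaceModel

    # Word-level timestamps from the default checkpoint; 'audio.wav' is a placeholder.
    model = HuggingFaceModel({'model_id': 'openai/whisper-tiny',
                              'segment_type': 'word',
                              'chunk_length_s': 30})
    subs = model.transcribe('audio.wav')
    print(subs.to_string('srt'))  # pysubs2 SSAFile serialized as SRT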
