Transformers
PyTorch
multilingual
seamless_basic
audio
text
multimodal
seamless
subtitle-editing-time-prediction
Instructions to use videoloc/seamless-basic with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use videoloc/seamless-basic with Transformers:
```python
# Load model directly
from transformers import HFSeamlessBasic
model = HFSeamlessBasic.from_pretrained("videoloc/seamless-basic", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| # Example usage for videoloc/seamless-basic | |
| from transformers import AutoModel, AutoConfig | |
| from huggingface_hub import hf_hub_download | |
| import torch | |
| import numpy as np | |
| import importlib.util | |
def _load_hub_module(repo_id, filename, module_name):
    """Download a Python source file from a Hub repo and import it as a module.

    Args:
        repo_id: Hub repository that hosts the file.
        filename: Name of the Python file inside the repository.
        module_name: Name given to the imported module.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
    """
    source_path = hf_hub_download(repo_id=repo_id, filename=filename)
    spec = importlib.util.spec_from_file_location(module_name, source_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot import {filename!r} from {repo_id!r}")
    module = importlib.util.module_from_spec(spec)
    # NOTE(security): this runs code downloaded from the Hub; only point it
    # at repositories you trust.
    spec.loader.exec_module(module)
    return module


def load_model_and_collator(repo_id="videoloc/seamless-basic"):
    """Load the custom model and its data collator from the Hub.

    The architecture is custom, so the modeling and collator source files are
    fetched from the repository and imported dynamically before use.

    Args:
        repo_id: Hub repository holding the weights, ``modeling_seamless_basic.py``
            and ``data_collator.py``. Defaults to ``"videoloc/seamless-basic"``.

    Returns:
        A ``(model, data_collator)`` tuple.

    Raises:
        ImportError: If either downloaded source file cannot be imported.
    """
    # Custom architecture: the model class lives in the repo, not in transformers.
    modeling_module = _load_hub_module(
        repo_id, "modeling_seamless_basic.py", "modeling_seamless_basic"
    )
    # from_pretrained is called without a separate config object; the original
    # loaded one but never used it.
    model = modeling_module.HFSeamlessBasic.from_pretrained(repo_id)

    collator_module = _load_hub_module(repo_id, "data_collator.py", "data_collator")
    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator
def example_inference():
    """Run one prediction on synthetic data and print the result.

    Builds a single example (random audio plus a subtitle string), collates
    it, runs the model without gradient tracking, and prints the predicted
    Time To Edit.

    Returns:
        The prediction as a Python scalar taken from ``outputs.logits``.
    """
    model, collator = load_model_and_collator()
    model.eval()

    # Example data: audio segment + subtitle text to predict editing time.
    sample_rate = 16000  # 16kHz
    duration_sec = 3
    example = {
        'raw_audio': np.random.randn(sample_rate * duration_sec),
        'raw_text': "Hello, welcome to our presentation today.",
    }
    batch = collator([example])

    with torch.no_grad():
        outputs = model(**batch)

    # .item() assumes logits holds exactly one value for this one-item batch.
    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    example_inference()