From e151b645f98eca8424e07459d232f0cb21d35fda Mon Sep 17 00:00:00 2001 From: Luke Oliff Date: Sun, 29 Mar 2026 08:28:31 +0100 Subject: [PATCH] feat!: modernise project to current Deepgram Python standards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace setup.py with pyproject.toml (setuptools backend, ruff/mypy/pytest config) - Add full type hints and docstrings across all source files - Add py.typed PEP 561 marker - Export ConverterException and EmptyTranscriptException from __init__ - Add CI workflow (lint + type check + test matrix on Python 3.10–3.13) - Switch release workflow to PyPI trusted publishing (OIDC, no API token) - Fix PEP 639 license classifier conflict breaking pip install on newer setuptools - Fix datetime.utcfromtimestamp() deprecation for Python 3.12+ compatibility BREAKING CHANGE: webvtt() and srt() now raise EmptyTranscriptException when the converter returns no lines; previously returned an empty string. --- .github/.release-please-manifest.json | 3 + .github/release-please-config.json | 16 + .github/workflows/ci.yml | 41 ++ .github/workflows/release.yml | 69 +-- CHANGELOG.md | 88 ++++ CONTRIBUTING.md | 352 +++++++++++++-- Makefile | 26 ++ README.md | 625 +++++++++++++++++++++++--- SECURITY.md | 70 +++ deepgram_captions/__init__.py | 31 +- deepgram_captions/_version.py | 2 +- deepgram_captions/converters.py | 205 +++++++-- deepgram_captions/helpers.py | 79 +++- deepgram_captions/py.typed | 0 deepgram_captions/srt.py | 91 ++-- deepgram_captions/webvtt.py | 71 ++- pyproject.toml | 63 +++ setup.py | 48 +- test/test_assembly.py | 34 +- test/test_deepgram.py | 44 +- test/test_whisper.py | 35 +- 21 files changed, 1645 insertions(+), 348 deletions(-) create mode 100644 .github/.release-please-manifest.json create mode 100644 .github/release-please-config.json create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md create mode 100644 Makefile create mode 100644 SECURITY.md create 
mode 100644 deepgram_captions/py.typed create mode 100644 pyproject.toml diff --git a/.github/.release-please-manifest.json b/.github/.release-please-manifest.json new file mode 100644 index 0000000..c3f1463 --- /dev/null +++ b/.github/.release-please-manifest.json @@ -0,0 +1,3 @@ +{ + ".": "1.2.0" +} diff --git a/.github/release-please-config.json b/.github/release-please-config.json new file mode 100644 index 0000000..4b0cf4f --- /dev/null +++ b/.github/release-please-config.json @@ -0,0 +1,16 @@ +{ + "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json", + "release-type": "python", + "bump-minor-pre-major": true, + "bump-patch-for-minor-pre-major": true, + "include-v-in-tag": true, + "packages": { + ".": { + "component": "deepgram-captions", + "include-component-in-tag": false, + "extra-files": [ + "deepgram_captions/_version.py" + ] + } + } +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a235750 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + lint: + name: Lint & typecheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + - name: Install dev dependencies + run: pip install -e ".[dev]" + - name: Ruff format check + run: ruff format --check deepgram_captions/ test/ + - name: Ruff lint + run: ruff check deepgram_captions/ test/ + - name: Mypy + run: mypy deepgram_captions/ + + test: + name: Test Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dev dependencies + run: pip install -e ".[dev]" + - name: Run tests + run: pytest test/ -v diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index b5d0b3f..007ca60 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,42 +1,55 @@ -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - name: Release on: - release: - types: [published] + push: + branches: [main] + workflow_dispatch: permissions: - contents: read + contents: write + pull-requests: write jobs: - deploy: + release-please: + name: Release Please + runs-on: ubuntu-latest + outputs: + release_created: ${{ steps.release.outputs.release_created }} + tag_name: ${{ steps.release.outputs.tag_name }} + steps: + - uses: googleapis/release-please-action@v4 + id: release + with: + token: ${{ github.token }} + config-file: .github/release-please-config.json + manifest-file: .github/.release-please-manifest.json + + publish: + name: Publish to PyPI + needs: release-please + if: ${{ needs.release-please.outputs.release_created }} runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/deepgram-captions + permissions: + id-token: write # required for trusted publishing steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 + - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: - python-version: "3.x" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Update Version in _version.py - run: sed -i "s/0.0.0/${{ github.event.release.tag_name }}/g" ./deepgram_captions/_version.py + python-version: "3.12" + + - name: Install build tools + run: pip install --upgrade pip 
build + - name: Build package run: python -m build - - name: Install twine - run: python -m pip install --upgrade twine - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + # No API token needed — uses OIDC trusted publishing. + # Configure at: https://pypi.org/manage/project/deepgram-captions/settings/publishing/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..7c042b8 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,88 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.2.0] - 2024-03-15 + +### Added +- `pyproject.toml` as the canonical build configuration (replaces `setup.py` as the primary build definition) +- `py.typed` marker file for PEP 561 compliance — fully typed package +- `Makefile` with `install`, `test`, `lint`, `lint-fix`, `format`, `format-check`, `typecheck`, `check`, and `dev` targets +- GitHub Actions CI workflow (`ci.yml`) running lint, type checking, and tests across Python 3.10–3.13 +- `ruff` for linting and formatting (replaces `black`) +- `mypy` for static type checking +- Full type annotations on all public APIs in `helpers.py`, `converters.py`, `webvtt.py`, and `srt.py` +- Comprehensive docstrings for all public classes and functions +- `SECURITY.md` with responsible disclosure policy +- `CHANGELOG.md` (this file) + +### Changed +- `DeepgramConverter`, `AssemblyAIConverter`, and `WhisperTimestampedConverter` now carry full type hints +- `webvtt()` and `srt()` functions are now fully typed with `Any` converter protocol +- `EmptyTranscriptException` and `ConverterException` are now exported from the top-level 
`deepgram_captions` package +- Updated classifiers to reflect Production/Stable status and Python 3.10–3.13 support +- Release workflow updated to use `actions/checkout@v4` and `actions/setup-python@v4` +- Release workflow version bumping now targets `pyproject.toml` instead of `_version.py` only + +### Fixed +- `chunk_array` simplified to a single list comprehension (functionally identical, more idiomatic) + +## [1.1.0] - 2023-11-08 + +### Added +- `AssemblyAIConverter` — support for AssemblyAI speech-to-text API responses +- `WhisperTimestampedConverter` — support for [Whisper Timestamped](https://github.com/linto-ai/whisper-timestamped) responses (word-level timestamps required) +- `replace_text_with_word()` helper to normalise `"text"` key to `"word"` for Whisper Timestamped compatibility +- Documentation note clarifying that OpenAI Whisper (without word timestamps) is not supported directly; users should use Deepgram's hosted Whisper Cloud (`model=whisper`) with `DeepgramConverter` + +### Changed +- `get_lines()` on `AssemblyAIConverter` now respects `utterances` array when present, falling back to flat `words` array +- `WhisperTimestampedConverter.get_lines()` processes `segments` array and applies `replace_text_with_word` normalisation + +## [1.0.0] - 2023-10-15 + +### Added +- Speaker diarisation support in `DeepgramConverter.get_lines()`: when word objects include a `"speaker"` field, caption lines break on speaker changes in addition to `line_length` limits +- Speaker labels in WebVTT output using voice tags: `<v Speaker N>text` +- Speaker labels in SRT output as `[speaker N]` prefix lines, emitted once per speaker change +- `use_exception` parameter on `DeepgramConverter.__init__()` — set to `False` to suppress `ConverterException` when no valid transcript is found +- `EmptyTranscriptException` raised by `webvtt()` and `srt()` when the converter returns an empty first line +- `line_length` parameter on `webvtt()` and `srt()` — controls the maximum number of words per 
caption cue (default: 8) +- `get_headers()` on `DeepgramConverter` returns a `NOTE` block for WebVTT output containing request ID, creation time, duration, and channel count from the Deepgram response metadata + +### Changed +- `DeepgramConverter` now prefers the `utterances` array over `channels[0].alternatives[0].words` when both are present, producing more natural sentence-level caption breaks +- `webvtt()` checks for `get_headers()` capability via `hasattr`/`callable` — custom converters do not need to implement it + +### Fixed +- Microsecond precision in `seconds_to_timestamp()` correctly truncated to milliseconds for both WebVTT (`.`) and SRT (`,`) formats + +## [0.1.0] - 2023-09-20 + +### Added +- `DeepgramConverter` class wrapping Deepgram pre-recorded and streaming API responses +- `webvtt()` function generating valid WebVTT documents from any converter +- `srt()` function generating valid SRT documents from any converter +- `seconds_to_timestamp()` utility converting float seconds to `HH:MM:SS.mmm` or `HH:MM:SS,mmm` +- `chunk_array()` utility splitting word lists into fixed-length groups +- `EmptyTranscriptException` for empty transcript detection +- Support for Deepgram SDK response objects via `.to_json()` method detection +- Initial test suite covering Deepgram pre-recorded responses + +## [0.0.1] - 2023-08-01 + +### Added +- Initial project scaffold +- Package structure: `deepgram_captions/` with `__init__.py`, `helpers.py`, `converters.py`, `webvtt.py`, `srt.py` +- `setup.py` with basic package metadata +- MIT License +- Initial README + +[1.2.0]: https://github.com/deepgram/deepgram-python-captions/compare/v1.1.0...v1.2.0 +[1.1.0]: https://github.com/deepgram/deepgram-python-captions/compare/v1.0.0...v1.1.0 +[1.0.0]: https://github.com/deepgram/deepgram-python-captions/compare/v0.1.0...v1.0.0 +[0.1.0]: https://github.com/deepgram/deepgram-python-captions/compare/v0.0.1...v0.1.0 +[0.0.1]: 
https://github.com/deepgram/deepgram-python-captions/releases/tag/v0.0.1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 22d0951..69f9c4f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,45 +1,323 @@ # Contributing Guidelines -Want to contribute to this project? We ❤️ it! - -Here are a few types of contributions that we would be interested in hearing about. - -* Bug fixes - * If you find a bug, please first report it using Github Issues. - * Issues that have already been identified as a bug will be labeled `🐛 bug`. - * If you'd like to submit a fix for a bug, send a Pull Request from your own fork and mention the Issue number. - * Include a test that isolates the bug and verifies that it was fixed. -* New Features - * If you'd like to accomplish something in the extension that it doesn't already do, describe the problem in a new Github Issue. - * Issues that have been identified as a feature request will be labeled `✨ enhancement`. - * If you'd like to implement the new feature, please wait for feedback from the project maintainers before spending - too much time writing the code. In some cases, `✨ enhancement`s may not align well with the project objectives at - the time. -* Tests, Documentation, Miscellaneous - * If you think the test coverage could be improved, the documentation could be clearer, you've got an alternative - implementation of something that may have more advantages, or any other change we would still be glad hear about - it. - * If its a trivial change, go ahead and send a Pull Request with the changes you have in mind - * If not, open a Github Issue to discuss the idea first. - -We also welcome anyone to work on any existing issues with the `👋🏽 good first issue` tag. - -## Requirements +Thank you for your interest in contributing to `deepgram-captions`! We welcome +contributions of all kinds — bug fixes, new converter support, documentation +improvements, and test coverage increases. 
+ +Please take a moment to review this document before submitting a pull request. + +## Code of Conduct + +By participating in this project you agree to abide by our +[Code of Conduct](.github/CODE_OF_CONDUCT.md). Please read it before +contributing. + +## Types of Contributions + +### Bug Fixes + +- If you find a bug, please first report it using + [GitHub Issues](https://github.com/deepgram/deepgram-python-captions/issues/new). +- Issues confirmed as bugs are labelled `bug`. +- If you'd like to fix a bug yourself, send a Pull Request from your fork and + reference the Issue number. +- Include a test that isolates the bug and verifies the fix. + +### New Features / Converters + +- If you'd like to add support for a new speech-to-text provider, or add a + new feature, describe the problem or use case in a + [GitHub Issue](https://github.com/deepgram/deepgram-python-captions/issues/new). +- Issues identified as feature requests are labelled `enhancement`. +- Wait for feedback from the project maintainers before spending significant + time writing code — some ideas may not align with the project's current + direction. + +### Tests, Documentation, Refactoring + +- If you think test coverage could be improved, the documentation could be + clearer, or you have an alternative implementation that has advantages, + we are happy to hear it. +- For trivial changes, go ahead and open a Pull Request directly. +- For larger changes, open a GitHub Issue to discuss first. + +We also welcome contributions to any existing issues labelled +`good first issue`. + +--- + +## Setting Up Your Development Environment + +### Prerequisites + +- Python 3.10 or higher +- `pip` (or `pipx` for isolated tool installs) +- `git` + +### Steps + +1. **Fork** the repository on GitHub. + +2. **Clone** your fork locally: + + ```bash + git clone https://github.com/YOUR_USERNAME/deepgram-python-captions.git + cd deepgram-python-captions + ``` + +3. 
**Install** the package in editable mode with dev dependencies: + + ```bash + pip install -e ".[dev]" + ``` + + This installs `pytest`, `ruff`, and `mypy` alongside the package itself. + +4. **Verify** your setup by running the tests: + + ```bash + make test + ``` + +--- + +## Running Tests + +Tests live in the `test/` directory and use [pytest](https://pytest.org). + +```bash +# Run all tests +make test + +# Or directly +pytest test/ -v + +# Run a single test file +pytest test/test_deepgram.py -v +``` + +All tests must pass before a pull request will be merged. + +--- + +## Code Style + +This project uses [ruff](https://docs.astral.sh/ruff/) for linting and +formatting, and [mypy](https://mypy.readthedocs.io/) for static type checking. + +### Formatting + +```bash +# Format all source files +make format + +# Check formatting without making changes +make format-check +``` + +Line length is set to **120 characters**. + +### Linting + +```bash +# Run linter +make lint + +# Auto-fix lint issues +make lint-fix +``` + +### Type Checking + +All public functions and methods should include type annotations. We use +`from __future__ import annotations` in all source files for forward reference +support. + +```bash +# Run mypy +make typecheck +``` + +### Run All Checks + +```bash +# format-check + lint + typecheck (no tests) +make check + +# Full development cycle: lint-fix + format + test +make dev +``` + +--- + +## Adding a New Converter + +A converter is any object that implements the following duck-typing interface: + +### Required + +```python +def get_lines(self, line_length: int) -> list[list[dict]]: + ... +``` + +Return a list of caption cue groups. 
Each group is a list of word dicts +containing at minimum: + +| Key | Type | Description | +| ----------------- | ------- | --------------------------------------------------- | +| `word` | `str` | Word text (used as fallback display text) | +| `punctuated_word` | `str` | Punctuated form of the word (preferred for display) | +| `start` | `float` | Start time in seconds | +| `end` | `float` | End time in seconds | +| `speaker` | `int` | (Optional) Speaker index for diarisation | + +If `punctuated_word` is absent, `word` is used instead. If `speaker` is +present on any word in the first cue group, speaker labels are automatically +emitted by the formatters. + +### Optional + +```python +def get_headers(self) -> list[str]: + ... +``` + +Return a list of strings to be joined as a `NOTE` block in WebVTT output +(placed after the `WEBVTT` header line). If this method is absent, no `NOTE` +block is generated. + +### Placement + +Add new converters to `deepgram_captions/converters.py` and export them from +`deepgram_captions/__init__.py`. Add tests in `test/` using a representative +fixture JSON response from the provider. + +### Example Skeleton + +```python +from __future__ import annotations + +from typing import Any + +from .helpers import chunk_array + + +class MyProviderConverter: + """Convert a MyProvider speech-to-text response into caption lines. + + Args: + response: The full MyProvider API response dict. 
+ """ + + def __init__(self, response: dict[str, Any]) -> None: + self.response = response + + def get_lines(self, line_length: int = 8) -> list[list[dict[str, Any]]]: + """Return caption lines as groups of normalised word dicts.""" + words = [ + { + "word": w["token"], + "punctuated_word": w.get("display", w["token"]), + "start": w["start_time"], + "end": w["end_time"], + } + for w in self.response.get("words", []) + ] + return chunk_array(words, line_length) +``` + +--- + +## Commit Message Convention + +We follow the [Conventional Commits](https://www.conventionalcommits.org/) +specification. All commit messages must use one of the following types: + +| Type | When to use | +| ---------- | ------------------------------------------------------------------ | +| `feat` | A new feature or converter | +| `fix` | A bug fix | +| `docs` | Documentation changes only | +| `style` | Code style / formatting changes (no logic change) | +| `refactor` | Code restructuring without feature changes or bug fixes | +| `perf` | Performance improvements | +| `test` | Adding or improving tests | +| `chore` | Maintenance tasks, dependency updates, tooling changes | +| `ci` | CI/CD configuration changes | + +**Format:** + +``` +(): + + + + +``` + +**Examples:** + +``` +feat(converters): add RevAI converter +fix(srt): correct millisecond precision for timestamps > 1 hour +docs(readme): add streaming transcription example +test(assemblyai): add fixture for utterances response +chore(deps): upgrade ruff to 0.11 +``` + +--- + +## Pull Request Process + +1. **Fork** the repository and create a new branch from `main`. + + ```bash + git checkout -b feat/my-new-converter + ``` + +2. **Make your changes** following the code style and commit conventions above. + +3. **Run the full check suite** before opening a PR: + + ```bash + make dev # lint-fix + format + test + make check # format-check + lint + typecheck + ``` + +4. 
**Push** your branch to your fork: + + ```bash + git push origin feat/my-new-converter + ``` + +5. **Open a Pull Request** from your branch to `main` in the upstream + repository. Include: + - A clear description of what the PR does and why. + - A reference to the related Issue (if applicable): `Closes #123`. + - Any notes on testing approach or edge cases. + +6. A maintainer will review your PR. You may be asked to make changes before + it is merged. + +--- + +## Acceptance Criteria For a contribution to be accepted: -* The test suite must be complete and pass -* Code must follow existing styling conventions -* Commit messages must be descriptive. Related issues should be mentioned by number. +- The test suite must pass: `make test`. +- Code must pass all quality checks: `make check`. +- Commit messages must follow the Conventional Commits format. +- New public APIs must include type annotations and docstrings. +- Related Issues should be mentioned in the PR description. + +--- -If the contribution doesn't meet these criteria, a maintainer will discuss it with you on the Issue. You can still -continue to add more commits to the branch you have sent the Pull Request from. +## Getting Help -## How To +If you have questions about contributing, feel free to: -1. Fork this repository on GitHub. -1. Clone/fetch your fork to your local development machine. -1. Create a new branch (e.g. `issue-12`, `feat.add_foo`, etc) and check it out. -1. Make your changes and commit them. (Did the tests pass? No linting errors?) -1. Push your new branch to your fork. (e.g. `git push myname issue-12`) -1. Open a Pull Request from your new branch to the original fork's `main` branch. 
+- [Open a GitHub Issue](https://github.com/deepgram/deepgram-python-captions/issues/new) +- [Join the Deepgram Discord Community](https://discord.gg/xWRaCDBtW4) +- [Join GitHub Discussions](https://github.com/orgs/deepgram/discussions) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9d09273 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +.PHONY: install test lint lint-fix format format-check typecheck check dev + +install: + pip install -e ".[dev]" + +test: + pytest test/ -v + +lint: + ruff check deepgram_captions/ test/ + +lint-fix: + ruff check --fix deepgram_captions/ test/ + +format: + ruff format deepgram_captions/ test/ + +format-check: + ruff format --check deepgram_captions/ test/ + +typecheck: + mypy deepgram_captions/ + +check: format-check lint typecheck + +dev: lint-fix format test diff --git a/README.md b/README.md index 602a50b..e945ec8 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,25 @@ -# Deepgram Python Captions +# deepgram-captions -[![Discord](https://dcbadge.vercel.app/api/server/xWRaCDBtW4?style=flat)](https://discord.gg/xWRaCDBtW4) [![PyPI version](https://badge.fury.io/py/deepgram-captions.svg)](https://badge.fury.io/py/deepgram-captions) +[![Discord](https://dcbadge.vercel.app/api/server/xWRaCDBtW4?style=flat)](https://discord.gg/xWRaCDBtW4) +[![PyPI version](https://badge.fury.io/py/deepgram-captions.svg)](https://badge.fury.io/py/deepgram-captions) +[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) +[![MIT License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) -This package is the Python implementation of Deepgram's WebVTT and SRT formatting. Given a transcription, this package can return a valid string to store as WebVTT or SRT caption files. +Official Python library for generating **WebVTT** and **SRT** captions from +[Deepgram](https://deepgram.com) and other speech-to-text API responses. 
-The package is not dependent on Deepgram, but it is expected that you will provide a JSON response from a transcription request from either Deepgram or one of the other supported speech-to-text APIs. +Given a transcription response, this package returns valid WebVTT or SRT caption +strings ready to embed in video players, upload to streaming platforms, or store +as caption files. It handles word-level timestamps, speaker diarisation, and +configurable line lengths out of the box. + +The library ships converters for **Deepgram**, **AssemblyAI**, and +**Whisper Timestamped**, and exposes a simple duck-typing interface so you can +add support for any other provider. + +Full documentation is available at [developers.deepgram.com](https://developers.deepgram.com/docs). + +--- ## Installation @@ -12,96 +27,179 @@ The package is not dependent on Deepgram, but it is expected that you will provi pip install deepgram-captions ``` -## How it works +Python 3.10 or higher is required. The package has no runtime dependencies. -The converter takes in a JSON object response (see examples in the `./test` folder.) Depending on which API you use, the converter will turn that into a shape that can be handled by the `webvtt` and `srt` scripts. +--- -You provide the JSON object; then select the converter needed such as `DeepgramConverter`, `WhisperTimestampedConverter`, `AssemblyAIConverter` and so on. (If the API you want to use is not supported, please reach out to `devrel@deepgram.com` and we will do our best to add it.) 
- -## WebVTT from Deepgram Transcriptions +## Quick Start ```python -from deepgram_captions import DeepgramConverter, webvtt +import json +from deepgram_captions import DeepgramConverter, webvtt, srt -transcription = DeepgramConverter(dg_response) -captions = webvtt(transcription) -``` +# Load a Deepgram pre-recorded transcription response +with open("response.json") as f: + dg_response = json.load(f) -## SRT from Deepgram Transcriptions +converter = DeepgramConverter(dg_response) -```py -from deepgram_captions import DeepgramConverter, srt +# Generate WebVTT +vtt = webvtt(converter) +with open("captions.vtt", "w") as f: + f.write(vtt) -transcription = DeepgramConverter(dg_response) -captions = srt(transcription) +# Generate SRT +subtitles = srt(converter) +with open("captions.srt", "w") as f: + f.write(subtitles) ``` -### Line length +--- -Add an optional integer parameter to set the line length of the caption. +## Deepgram -```py -line_length = 10 +### Pre-recorded Transcription -deepgram = DeepgramConverter(dg_speakers) -captions = webvtt(deepgram, line_length) -``` +Send an audio file to Deepgram's pre-recorded API, then pass the response +directly to `DeepgramConverter`. The Deepgram Python SDK returns response +objects with a `.to_json()` method — `DeepgramConverter` accepts both plain +`dict` responses and SDK response objects. -## Other Converters +```python +import httpx +import json +from deepgram_captions import DeepgramConverter, webvtt, srt -### Whisper +# Using httpx / requests directly +url = "https://api.deepgram.com/v1/listen?model=nova-3&smart_format=true&utterances=true" +headers = {"Authorization": "Token YOUR_DEEPGRAM_API_KEY"} -Open AI's Whisper (through their API) does not provide timestamps, so a JSON response directly from OpenAI cannot be used with this package. 
However, there are a couple other options you can try: +with open("audio.wav", "rb") as f: + response = httpx.post(url, content=f.read(), -#### Deepgram's Whisper Cloud + headers={**headers, "Content-Type": "audio/wav"}) -Use Deepgram's fully hosted Whisper Cloud, which gives you Whisper transcriptions along with the features that come with Deepgram's API such as timestamps. Use `model=whisper` when you make your request to Deepgram. Then use the `DeepgramConverter` to create the captions. +dg_response = response.json() +```py -from deepgram_captions import DeepgramConverter, srt +converter = DeepgramConverter(dg_response) +print(webvtt(converter)) +print(srt(converter)) +``` -transcription = DeepgramConverter(whisper_response) -captions = srt(transcription) -``` Using the [Deepgram Python SDK](https://github.com/deepgram/deepgram-python-sdk): -#### Whisper Timestamped +```python +from deepgram import DeepgramClient, PrerecordedOptions +from deepgram_captions import DeepgramConverter, webvtt, srt + +deepgram = DeepgramClient("YOUR_DEEPGRAM_API_KEY") -[Whisper Timestamped](https://github.com/linto-ai/whisper-timestamped) adds word-level timestamps to OpenAI's Whisper speech-to-text transcriptions. Word-level timestamps are required for this package to create captions, which is why we have created the captions converter for Whisper Timestamped (and not OpenAI's Whisper). 
+options = PrerecordedOptions( + model="nova-3", + smart_format=True, + utterances=True, +) -```py -from deepgram_captions import WhisperTimestampedConverter, webvtt +response = deepgram.listen.rest.v("1").transcribe_file( + {"buffer": buffer_data}, options +) -transcription = WhisperTimestampedConverter(whisper_response) -captions = webvtt(transcription) +# DeepgramConverter accepts the SDK response object directly +converter = DeepgramConverter(response) +print(webvtt(converter)) ``` -### Assembly AI +> **Tip:** Enable `utterances=True` in your Deepgram request for the best +> caption results. When utterances are present, `DeepgramConverter` uses them +> for natural sentence-level caption breaks instead of chunking raw words. -AssemblyAI is another popular speech-to-text API. +### Live / Streaming Transcription -```py -from deepgram_captions import AssemblyAIConverter, webvtt +For streaming audio, Deepgram returns incremental `Results` messages. Each +message contains a `channel.alternatives[0].words` array for that audio chunk. +To generate captions from a completed stream, accumulate the word objects from +all `is_final=True` results and build a synthetic response object, then pass it +to `DeepgramConverter`. -transcription = AssemblyAIConverter(assembly_response) -captions = webvtt(transcription) +```python +import asyncio +from deepgram import DeepgramClient, LiveOptions, LiveTranscriptionEvents +from deepgram_captions import DeepgramConverter, webvtt + +all_words = [] + +def on_message(self, result, **kwargs): + sentence = result.channel.alternatives[0] + if result.is_final and sentence.words: + all_words.extend(sentence.words) + +async def main(): + deepgram = DeepgramClient("YOUR_DEEPGRAM_API_KEY") + connection = deepgram.listen.asyncwebsocket.v("1") + connection.on(LiveTranscriptionEvents.Transcript, on_message) + + options = LiveOptions(model="nova-3", smart_format=True) + await connection.start(options) + + # ... stream your audio here ... 
+ + await connection.finish() + + # Build a synthetic pre-recorded response from accumulated words + synthetic_response = { + "metadata": {"request_id": "streaming-session"}, + "results": { + "channels": [ + { + "alternatives": [ + { + "transcript": " ".join(w.word for w in all_words), + "words": [ + { + "word": w.word, + "punctuated_word": w.punctuated_word, + "start": w.start, + "end": w.end, + "confidence": w.confidence, + } + for w in all_words + ], + } + ] + } + ] + }, + } + + converter = DeepgramConverter(synthetic_response) + print(webvtt(converter)) + +asyncio.run(main()) ``` -## Output +--- -### Output WebVTT +## Output Formats -When transcribing https://dpgr.am/spacewalk.wav, and running it through our library, this is the WebVTT output. +### WebVTT -```py -from deepgram_captions.converters import DeepgramConverter -from deepgram_captions.webvtt import webvtt +[Web Video Text Tracks (WebVTT)](https://www.w3.org/TR/webvtt1/) is the standard +caption format for HTML5 `