Commit ee98bac

feat: deepgram text to speech (#831)

Authored by: micaelakaplan
Co-authored-by: makseq <makseq@gmail.com>
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent fb0fae1 commit ee98bac

File tree: 13 files changed, +734 additions, -0 deletions

.github/workflows/tests.yml

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ jobs:
         if: ${{ matrix.skip_pytests != 'true' }}
         env:
           ML_BACKEND: ${{ matrix.backend_dir_name }}
+          TEST_ENV: "true"
         run: |
           docker compose -f label_studio_ml/examples/${{ matrix.backend_dir_name }}/docker-compose.yml exec -T ${{ matrix.backend_dir_name }} pytest -vvv --cov --cov-report=xml:/tmp/coverage.xml
label_studio_ml/examples/deepgram/Dockerfile

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# syntax=docker/dockerfile:1
ARG PYTHON_VERSION=3.13

FROM python:${PYTHON_VERSION}-slim AS python-base
ARG TEST_ENV

WORKDIR /app

ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=${PORT:-9090} \
    PIP_CACHE_DIR=/.cache \
    WORKERS=1 \
    THREADS=8

# Update the base OS
RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
    --mount=type=cache,target="/var/lib/apt/lists",sharing=locked \
    set -eux; \
    apt-get update; \
    apt-get upgrade -y; \
    apt install --no-install-recommends -y \
        git; \
    apt-get autoremove -y

# install base requirements
COPY requirements-base.txt .
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
    pip install -r requirements-base.txt

# install custom requirements
COPY requirements.txt .
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
    pip install -r requirements.txt

# install test requirements if needed
COPY requirements-test.txt .
# build only when TEST_ENV="true"
RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
    if [ "$TEST_ENV" = "true" ]; then \
        pip install -r requirements-test.txt; \
    fi

COPY . .

EXPOSE 9090
CMD gunicorn --preload --bind :$PORT --workers $WORKERS --threads $THREADS --timeout 0 _wsgi:app
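The TEST_ENV build argument above is what the CI change in tests.yml feeds through docker compose. To exercise the same path by hand, a build along these lines should work (a sketch; the image tag mirrors the compose file and is otherwise arbitrary):

```
# Build the image with the test extras baked in (TEST_ENV gates requirements-test.txt)
docker build --build-arg TEST_ENV=true -t humansignal/ml-backend:v0 .

# Or a plain build without the test requirements
docker build -t humansignal/ml-backend:v0 .
```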
label_studio_ml/examples/deepgram/README.md

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Using Deepgram with Label Studio for Text to Speech annotation

This backend uses the Deepgram API to take the input text from the user, run text to speech, and return the output audio for annotation in Label Studio.

https://github.com/user-attachments/assets/9569a955-0baf-4a95-9e8a-d08250a0a298

IMPORTANT NOTE: YOU MUST REFRESH THE PAGE AFTER SUBMITTING THE TEXT TO SEE THE AUDIO APPEAR.

## Prerequisites
1. [Deepgram API key](https://deepgram.com/) -- create an account and follow the instructions to get an API key with default permissions. Store this key as `DEEPGRAM_API_KEY` in `docker-compose.yml`.
2. AWS storage -- make sure you configure the following parameters in `docker-compose.yml`:
   - `AWS_ACCESS_KEY_ID` -- your AWS access key ID
   - `AWS_SECRET_ACCESS_KEY` -- your AWS secret access key
   - `AWS_SESSION_TOKEN` -- your AWS session token
   - `AWS_DEFAULT_REGION` -- the region you want to use for S3
   - `S3_BUCKET` -- the name of the bucket where you'd like to store the created audio files
   - `S3_FOLDER` -- the name of the folder within the specified bucket where you'd like to store the audio files
3. Label Studio -- make sure you set your `LABEL_STUDIO_URL` and your `LABEL_STUDIO_API_KEY` in `docker-compose.yml`. As of 11/12/25, you must use the LEGACY TOKEN.

## Labeling Config
This is the base labeling config to use with this backend. Note that you may add additional annotations to the document after the audio without breaking anything!
```
<View>
  <Header value="What would you like to TTS?"/>
  <TextArea name="text" toName="audio" placeholder="What do you want to tts?" value="$text" rows="4" maxSubmissions="1"/>
  <Audio name="audio" value="$audio" zoom="true" hotkey="ctrl+enter"/>
</View>
```
## A Data Note
For this to work, you need to upload dummy data (i.e. empty text and audio) so that the tasks populate. You can use `dummy_data.json` as this data.

## Configuring the backend
When you attach the model to Label Studio in your model settings, make sure to toggle ON interactive preannotations!
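For orientation, here is a minimal sketch of the flow the README describes: send the task's text to Deepgram's TTS REST endpoint, upload the resulting audio to the configured S3 bucket/folder, and return the audio URL for the pre-annotation. The helper name, model parameter, and file format are assumptions for illustration, not the example's actual model code.

```
import os
import uuid

import boto3
import requests


def synthesize_and_upload(text: str) -> str:
    """Sketch: turn `text` into speech with Deepgram and return an S3 URI for Label Studio."""
    # Deepgram's hosted TTS endpoint; the model name is an example.
    resp = requests.post(
        "https://api.deepgram.com/v1/speak?model=aura-asteria-en",
        headers={
            "Authorization": f"Token {os.environ['DEEPGRAM_API_KEY']}",
            "Content-Type": "application/json",
        },
        json={"text": text},
        timeout=60,
    )
    resp.raise_for_status()  # the audio bytes come back in the response body

    # Upload the audio to the bucket/folder configured in docker-compose.yml.
    key = f"{os.environ['S3_FOLDER']}/{uuid.uuid4()}.mp3"
    s3 = boto3.client("s3", region_name=os.environ.get("AWS_DEFAULT_REGION", "us-east-1"))
    s3.put_object(Bucket=os.environ["S3_BUCKET"], Key=key, Body=resp.content)

    # Return an s3:// URI; Label Studio resolves it with the storage credentials you configured.
    return f"s3://{os.environ['S3_BUCKET']}/{key}"
```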
label_studio_ml/examples/deepgram/_wsgi.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
import os
import argparse
import json
import logging
import logging.config

logging.config.dictConfig({
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "standard": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s::%(funcName)s::%(lineno)d] %(message)s"
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "level": os.getenv('LOG_LEVEL'),
            "stream": "ext://sys.stdout",
            "formatter": "standard"
        }
    },
    "root": {
        "level": os.getenv('LOG_LEVEL'),
        "handlers": [
            "console"
        ],
        "propagate": True
    }
})

from label_studio_ml.api import init_app
from model import DeepgramModel


_DEFAULT_CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'config.json')


def get_kwargs_from_config(config_path=_DEFAULT_CONFIG_PATH):
    if not os.path.exists(config_path):
        return dict()
    with open(config_path) as f:
        config = json.load(f)
    assert isinstance(config, dict)
    return config


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Label studio')
    parser.add_argument(
        '-p', '--port', dest='port', type=int, default=9090,
        help='Server port')
    parser.add_argument(
        '--host', dest='host', type=str, default='0.0.0.0',
        help='Server host')
    parser.add_argument(
        '--kwargs', '--with', dest='kwargs', metavar='KEY=VAL', nargs='+', type=lambda kv: kv.split('='),
        help='Additional LabelStudioMLBase model initialization kwargs')
    parser.add_argument(
        '-d', '--debug', dest='debug', action='store_true',
        help='Switch debug mode')
    parser.add_argument(
        '--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default=None,
        help='Logging level')
    parser.add_argument(
        '--model-dir', dest='model_dir', default=os.path.dirname(__file__),
        help='Directory where models are stored (relative to the project directory)')
    parser.add_argument(
        '--check', dest='check', action='store_true',
        help='Validate model instance before launching server')
    parser.add_argument('--basic-auth-user',
                        default=os.environ.get('ML_SERVER_BASIC_AUTH_USER', None),
                        help='Basic auth user')

    parser.add_argument('--basic-auth-pass',
                        default=os.environ.get('ML_SERVER_BASIC_AUTH_PASS', None),
                        help='Basic auth pass')

    args = parser.parse_args()

    # setup logging level
    if args.log_level:
        logging.root.setLevel(args.log_level)

    def isfloat(value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    def parse_kwargs():
        param = dict()
        for k, v in args.kwargs:
            if v.isdigit():
                param[k] = int(v)
            elif v == 'True' or v == 'true':
                param[k] = True
            elif v == 'False' or v == 'false':
                param[k] = False
            elif isfloat(v):
                param[k] = float(v)
            else:
                param[k] = v
        return param

    kwargs = get_kwargs_from_config()

    if args.kwargs:
        kwargs.update(parse_kwargs())

    if args.check:
        print('Check "' + DeepgramModel.__name__ + '" instance creation..')
        model = DeepgramModel(**kwargs)

    app = init_app(model_class=DeepgramModel, basic_auth_user=args.basic_auth_user, basic_auth_pass=args.basic_auth_pass)

    app.run(host=args.host, port=args.port, debug=args.debug)

else:
    # for uWSGI use
    app = init_app(model_class=DeepgramModel)
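Outside Docker, the argparse entry point above can be run directly for local debugging. A sketch, assuming the example's requirements are installed and the environment variables from docker-compose.yml are exported in your shell:

```
# Start the development server with verbose logging
python _wsgi.py --port 9090 --log-level DEBUG

# Same, but first verify that DeepgramModel can be instantiated (--check)
python _wsgi.py --check --port 9090
```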
label_studio_ml/examples/deepgram/docker-compose.yml

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
services:
  deepgram:
    container_name: ml-backend
    image: humansignal/ml-backend:v0
    build:
      context: .
      args:
        TEST_ENV: ${TEST_ENV}

    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [ gpu ]

    environment:
      # specify these parameters if you want to use basic auth for the model server
      - BASIC_AUTH_USER=
      - BASIC_AUTH_PASS=
      # set the log level for the model server
      - LOG_LEVEL=DEBUG
      # any other parameters that you want to pass to the model server
      - ANY=PARAMETER
      # specify the number of workers and threads for the model server
      - WORKERS=1
      - THREADS=8
      # specify the model directory (likely you don't need to change this)
      - MODEL_DIR=/data/models
      # specify the Deepgram API key
      - DEEPGRAM_API_KEY=

      # For AWS upload
      - AWS_ACCESS_KEY_ID=
      - AWS_SECRET_ACCESS_KEY=
      - AWS_SESSION_TOKEN=
      - AWS_DEFAULT_REGION=us-east-1
      - S3_BUCKET=
      - S3_FOLDER=

      # Specify the Label Studio URL and API key to access
      # uploaded, local storage and cloud storage files.
      # Do not use 'localhost' as it does not work within Docker containers.
      # Use prefix 'http://' or 'https://' for the URL always.
      # Determine the actual IP using 'ifconfig' (Linux/Mac) or 'ipconfig' (Windows).
      - LABEL_STUDIO_URL=
      - LABEL_STUDIO_API_KEY=
    ports:
      - "9090:9090"
    volumes:
      - "./data/server:/data"
label_studio_ml/examples/deepgram/dummy_data.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
{
  "data": {"audio": "", "text": ""}
}
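Per the README's data note, this dummy task has to be imported before any TTS tasks appear. One way is Label Studio's task import API, shown below as a sketch: the project ID and host are placeholders, the command is run from the example directory, and it assumes the legacy `Authorization: Token` scheme mentioned in the README.

```
# Import the empty task into project 1 so a task appears for the TTS flow
curl -X POST "http://localhost:8080/api/projects/1/import" \
  -H "Authorization: Token $LABEL_STUDIO_API_KEY" \
  -H "Content-Type: application/json" \
  -d @dummy_data.json
```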
