diff --git a/examples/160-llamaindex-audio-loader-python/.env.example b/examples/160-llamaindex-audio-loader-python/.env.example new file mode 100644 index 0000000..2942f44 --- /dev/null +++ b/examples/160-llamaindex-audio-loader-python/.env.example @@ -0,0 +1,6 @@ +# Deepgram — https://console.deepgram.com/ +DEEPGRAM_API_KEY= + +# OpenAI — used by LlamaIndex default LLM and embeddings for querying the index +# https://platform.openai.com/api-keys +OPENAI_API_KEY= diff --git a/examples/160-llamaindex-audio-loader-python/README.md b/examples/160-llamaindex-audio-loader-python/README.md new file mode 100644 index 0000000..3301655 --- /dev/null +++ b/examples/160-llamaindex-audio-loader-python/README.md @@ -0,0 +1,71 @@ +# LlamaIndex Audio Document Loader — Transcribe Audio into RAG Pipelines + +Use Deepgram speech-to-text and Audio Intelligence to turn audio files into LlamaIndex Documents. Load podcasts, meetings, or lectures into a vector index and query them with natural language — all in a few lines of Python. + +## What you'll build + +A custom LlamaIndex `BaseReader` that transcribes audio URLs via Deepgram nova-3, enriches each Document with Audio Intelligence metadata (summary, topics, sentiment, entities), and feeds everything into a `VectorStoreIndex` for RAG-powered Q&A. + +## Prerequisites + +- Python 3.10+ +- Deepgram account — [get a free API key](https://console.deepgram.com/) +- OpenAI account (for query mode) — [get an API key](https://platform.openai.com/api-keys) + +## Environment variables + +| Variable | Where to find it | Required for | +|----------|-----------------|-------------| +| `DEEPGRAM_API_KEY` | [Deepgram console](https://console.deepgram.com/) | Both modes | +| `OPENAI_API_KEY` | [OpenAI dashboard](https://platform.openai.com/api-keys) | Query mode only | + +Copy `.env.example` to `.env` and fill in your values. 
+ +## Install and run + +```bash +pip install -r requirements.txt + +# Load audio into Documents — prints transcript and metadata +python src/audio_loader.py https://dpgr.am/spacewalk.wav + +# Query mode — ask a question about the audio content +python src/audio_loader.py --query "What was the main topic discussed?" https://dpgr.am/spacewalk.wav +``` + +## Key parameters + +| Parameter | Value | Description | +|-----------|-------|-------------| +| `model` | `nova-3` | Deepgram's latest and most accurate STT model | +| `smart_format` | `True` | Adds punctuation, capitalisation, and number formatting | +| `summarize` | `"v2"` | Generates a short summary of the audio content | +| `topics` | `True` | Detects topics discussed in the audio | +| `sentiment` | `True` | Analyses overall sentiment of the content | +| `detect_entities` | `True` | Extracts named entities (people, places, orgs) | + +## How it works + +1. `DeepgramAudioReader` implements LlamaIndex's `BaseReader` interface with a `load_data()` method +2. For each audio URL, it calls Deepgram's pre-recorded API (`transcribe_url`) with Audio Intelligence features enabled — Deepgram fetches the audio server-side +3. The transcript becomes `Document.text`; intelligence results (summary, topics, sentiment, entities) become `Document.metadata` +4. 
In query mode, the Documents are embedded via OpenAI and stored in a `VectorStoreIndex` for similarity search and LLM-powered answers + +## Extending this example + +- **Multiple audio files** — pass several URLs to build an index across many recordings +- **Custom metadata filters** — use LlamaIndex metadata filters to query only documents with specific topics or sentiment +- **Swap the vector store** — replace the in-memory default with Chroma, Pinecone, or Weaviate +- **Speaker diarization** — add `diarize=True` to split transcripts by speaker + +## Related + +- [Deepgram pre-recorded STT docs](https://developers.deepgram.com/docs/pre-recorded-audio) +- [Deepgram Audio Intelligence docs](https://developers.deepgram.com/docs/audio-intelligence) +- [Deepgram Python SDK](https://github.com/deepgram/deepgram-python-sdk) +- [LlamaIndex custom data loaders](https://docs.llamaindex.ai/en/stable/module_guides/loading/connector/) +- [LlamaIndex VectorStoreIndex](https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/) + +## Starter templates + +If you want a ready-to-run base for your own project, check the [deepgram-starters](https://github.com/orgs/deepgram-starters/repositories) org — there are starter repos for every language and every Deepgram product. 
diff --git a/examples/160-llamaindex-audio-loader-python/requirements.txt b/examples/160-llamaindex-audio-loader-python/requirements.txt
new file mode 100644
index 0000000..a2aab4b
--- /dev/null
+++ b/examples/160-llamaindex-audio-loader-python/requirements.txt
@@ -0,0 +1,5 @@
+deepgram-sdk>=5.0.0
+llama-index-core>=0.12.0
+llama-index-llms-openai>=0.4.0
+llama-index-embeddings-openai>=0.3.0
+python-dotenv>=1.0.0
diff --git a/examples/160-llamaindex-audio-loader-python/src/audio_loader.py b/examples/160-llamaindex-audio-loader-python/src/audio_loader.py
new file mode 100644
index 0000000..17693f3
--- /dev/null
+++ b/examples/160-llamaindex-audio-loader-python/src/audio_loader.py
@@ -0,0 +1,201 @@
+"""LlamaIndex reader that transcribes audio via Deepgram and returns Documents.
+
+Usage:
+    # Load audio into LlamaIndex Documents and query them
+    python src/audio_loader.py https://dpgr.am/spacewalk.wav
+
+    # Query mode — ask a question about the audio content
+    python src/audio_loader.py --query "What is the main topic?" https://dpgr.am/spacewalk.wav
+"""
+
+import os
+import sys
+from pathlib import Path
+from typing import List, Optional
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# SDK v5 Python: DeepgramClient reads DEEPGRAM_API_KEY from env automatically.
+from deepgram import DeepgramClient
+
+# LlamaIndex core: Document is the atomic unit of data, BaseReader defines
+# the load_data() contract that all readers/loaders implement.
+from llama_index.core import VectorStoreIndex
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+class DeepgramAudioReader(BaseReader):
+    """Transcribes audio files using Deepgram and returns LlamaIndex Documents.
+
+    Each audio URL becomes one Document whose text is the transcript.
+    Deepgram Audio Intelligence results (summary, topics, sentiment) are
+    attached as document metadata for filtering and enrichment in RAG pipelines.
+    """
+
+    def __init__(
+        self,
+        model: str = "nova-3",
+        smart_format: bool = True,
+        summarize: Optional[str] = "v2",
+        topics: bool = True,
+        sentiment: bool = True,
+        detect_entities: bool = True,
+        language: str = "en",
+    ) -> None:
+        self.model = model
+        self.smart_format = smart_format
+        self.summarize = summarize
+        self.topics = topics
+        self.sentiment = sentiment
+        self.detect_entities = detect_entities
+        self.language = language
+        self._client = DeepgramClient()
+
+    def load_data(self, audio_urls: List[str]) -> List[Document]:
+        """Transcribe each audio URL and return a list of Documents.
+
+        This follows the same pattern as llama-index-readers-assemblyai:
+        audio in → transcription API → Document objects out.
+        """
+        documents = []
+        for url in audio_urls:
+            doc = self._transcribe_url(url)
+            documents.append(doc)
+        return documents
+
+    def _transcribe_url(self, url: str) -> Document:
+        """Transcribe a single audio URL and build a Document with metadata."""
+        # ← transcribe_url has Deepgram fetch the audio server-side
+        response = self._client.listen.v1.media.transcribe_url(
+            url=url,
+            model=self.model,
+            smart_format=self.smart_format,
+            # Audio Intelligence features run on the same transcription call —
+            # they are parameters, not separate endpoints.
+            summarize=self.summarize,
+            topics=self.topics,
+            sentiment=self.sentiment,
+            detect_entities=self.detect_entities,
+            language=self.language,
+        )
+
+        # response.results.channels[0].alternatives[0].transcript
+        channel = response.results.channels[0]
+        alt = channel.alternatives[0]
+        transcript = alt.transcript
+        confidence = alt.confidence
+        words = alt.words
+        duration = words[-1].end if words else 0.0
+
+        metadata = {
+            "source": url,
+            "duration_seconds": duration,
+            "confidence": confidence,
+            "model": self.model,
+            "language": self.language,
+        }
+
+        # Audio Intelligence results live at response.results.{feature}
+        summary = getattr(response.results, "summary", None)
+        if summary and hasattr(summary, "short"):
+            metadata["summary"] = summary.short
+
+        topics_result = getattr(response.results, "topics", None)
+        if topics_result and hasattr(topics_result, "segments"):
+            topic_list = []
+            for segment in topics_result.segments:
+                for topic in getattr(segment, "topics", []):
+                    if hasattr(topic, "topic"):
+                        topic_list.append(topic.topic)
+            metadata["topics"] = list(dict.fromkeys(topic_list))
+
+        sentiments_result = getattr(response.results, "sentiments", None)
+        if sentiments_result and hasattr(sentiments_result, "average"):
+            metadata["average_sentiment"] = sentiments_result.average.sentiment
+
+        entities_result = getattr(response.results, "entities", None)
+        if entities_result and hasattr(entities_result, "segments"):
+            entity_list = []
+            for segment in entities_result.segments:
+                if hasattr(segment, "value"):
+                    entity_list.append(f"{getattr(segment, 'entity_type', 'entity')}: {segment.value}")
+            metadata["entities"] = list(dict.fromkeys(entity_list))
+
+        return Document(text=transcript, metadata=metadata)
+
+
+def run_load(audio_urls: List[str]) -> None:
+    """Load audio into Documents and print their content and metadata."""
+    reader = DeepgramAudioReader()
+    documents = reader.load_data(audio_urls)
+
+    for i, doc in enumerate(documents):
+        print(f"\n{'='*60}")
+        print(f"Document {i+1}")
+        print(f"{'='*60}")
+        print(f"Source: {doc.metadata.get('source', 'unknown')}")
+        print(f"Duration: {doc.metadata.get('duration_seconds', 0):.1f}s")
+        print(f"Confidence: {doc.metadata.get('confidence', 0):.0%}")
+        if "summary" in doc.metadata:
+            print(f"Summary: {doc.metadata['summary']}")
+        if "topics" in doc.metadata:
+            print(f"Topics: {', '.join(doc.metadata['topics'][:5])}")
+        if "entities" in doc.metadata:
+            print(f"Entities: {', '.join(doc.metadata['entities'][:5])}")
+        print(f"\nTranscript preview:\n {doc.text[:300]}...")
+
+
+def run_query(audio_urls: List[str], question: str) -> None:
+    """Load audio, build a VectorStoreIndex, and query it.
+
+    This demonstrates the full RAG pipeline: audio → Deepgram → Documents →
+    embeddings → vector index → LLM-powered query.
+    Requires OPENAI_API_KEY for LlamaIndex default LLM and embeddings.
+    """
+    if not os.environ.get("OPENAI_API_KEY"):
+        print("Error: OPENAI_API_KEY is not set.", file=sys.stderr)
+        print("The query engine needs an LLM. Get a key at https://platform.openai.com/api-keys", file=sys.stderr)
+        sys.exit(1)
+
+    reader = DeepgramAudioReader()
+    documents = reader.load_data(audio_urls)
+
+    print(f"Loaded {len(documents)} document(s), building index...")
+
+    # VectorStoreIndex embeds the documents and stores them for similarity search.
+    # By default LlamaIndex uses OpenAI embeddings and an OpenAI chat LLM (override via Settings).
+    index = VectorStoreIndex.from_documents(documents)
+    query_engine = index.as_query_engine()
+
+    response = query_engine.query(question)
+
+    print(f"\n{'='*60}")
+    print(f"Question: {question}")
+    print(f"{'='*60}")
+    print(f"\n{response}")
+
+
+def main() -> None:
+    if len(sys.argv) < 2:
+        print("Usage:")
+        print("  python src/audio_loader.py <audio_url> [<audio_url> ...]")
+        print("  python src/audio_loader.py --query 'Your question' <audio_url> [<audio_url> ...]")
+        sys.exit(1)
+
+    if sys.argv[1] == "--query":
+        if len(sys.argv) < 4:
+            print("Error: provide a question and at least one audio URL", file=sys.stderr)
+            sys.exit(1)
+        question = sys.argv[2]
+        audio_urls = sys.argv[3:]
+        run_query(audio_urls, question)
+    else:
+        audio_urls = sys.argv[1:]
+        run_load(audio_urls)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/160-llamaindex-audio-loader-python/tests/test_example.py b/examples/160-llamaindex-audio-loader-python/tests/test_example.py
new file mode 100644
index 0000000..27f8b47
--- /dev/null
+++ b/examples/160-llamaindex-audio-loader-python/tests/test_example.py
@@ -0,0 +1,118 @@
+import os
+import sys
+from pathlib import Path
+
+# ── Credential check ────────────────────────────────────────────────────────
+# Exit code convention across all examples in this repo:
+#   0 = all tests passed
+#   1 = real test failure (code bug, assertion error, unexpected API response)
+#   2 = missing credentials (expected in CI until secrets are configured)
+env_example = Path(__file__).parent.parent / ".env.example"
+required = [
+    line.split("=")[0].strip()
+    for line in env_example.read_text().splitlines()
+    if line and not line.startswith("#") and "=" in line and line[0].isupper()
+]
+missing = [k for k in required if not os.environ.get(k)]
+if missing:
+    print(f"MISSING_CREDENTIALS: {','.join(missing)}", file=sys.stderr)
+    sys.exit(2)
+# ────────────────────────────────────────────────────────────────────────────
+
+from deepgram import DeepgramClient
+
+sys.path.insert(0,
str(Path(__file__).parent.parent / "src"))
+from audio_loader import DeepgramAudioReader
+
+AUDIO_URL = "https://dpgr.am/spacewalk.wav"
+
+
+def test_deepgram_stt():
+    """Verify the Deepgram API key works and nova-3 returns a transcript."""
+    client = DeepgramClient()
+    response = client.listen.v1.media.transcribe_url(
+        url=AUDIO_URL,
+        model="nova-3",
+        smart_format=True,
+    )
+    transcript = response.results.channels[0].alternatives[0].transcript
+    assert len(transcript) > 10, "Transcript too short"
+
+    lower = transcript.lower()
+    expected = ["spacewalk", "astronaut", "nasa"]
+    found = [w for w in expected if w in lower]
+    assert found, f"Expected keywords not found in: {transcript[:200]}"
+
+    print("✓ Deepgram STT integration working")
+    print(f"  Transcript preview: '{transcript[:80]}...'")
+
+
+def test_audio_reader_load_data():
+    """Verify DeepgramAudioReader returns Documents with transcript and metadata."""
+    reader = DeepgramAudioReader()
+    documents = reader.load_data([AUDIO_URL])
+
+    assert len(documents) == 1, f"Expected 1 document, got {len(documents)}"
+
+    doc = documents[0]
+    assert len(doc.text) > 10, "Document text too short"
+    assert doc.metadata.get("source") == AUDIO_URL, "Source metadata missing"
+    assert doc.metadata.get("confidence", 0) > 0.5, "Confidence too low"
+    assert doc.metadata.get("duration_seconds", 0) > 0, "Duration missing"
+    assert doc.metadata.get("model") == "nova-3", "Model metadata incorrect"
+
+    lower = doc.text.lower()
+    expected = ["spacewalk", "astronaut", "nasa"]
+    found = [w for w in expected if w in lower]
+    assert found, f"Expected keywords not found in document text: {doc.text[:200]}"
+
+    print("✓ DeepgramAudioReader load_data working")
+    print(f"  Document text length: {len(doc.text)} chars")
+    print(f"  Metadata keys: {list(doc.metadata.keys())}")
+
+
+def test_audio_reader_intelligence_metadata():
+    """Verify Audio Intelligence features populate document metadata."""
+    reader = DeepgramAudioReader(
+        summarize="v2",
+        topics=True,
+        sentiment=True,
+        detect_entities=True,
+    )
+    documents = reader.load_data([AUDIO_URL])
+    doc = documents[0]
+
+    has_intelligence = any(
+        k in doc.metadata for k in ["summary", "topics", "entities", "average_sentiment"]
+    )
+    assert has_intelligence, (
+        f"No Audio Intelligence metadata found. Keys: {list(doc.metadata.keys())}"
+    )
+
+    print("✓ Audio Intelligence metadata populated")
+    if "summary" in doc.metadata:
+        print(f"  Summary: {doc.metadata['summary'][:100]}...")
+    if "topics" in doc.metadata:
+        print(f"  Topics: {doc.metadata['topics'][:3]}")
+
+
+def test_document_is_indexable():
+    """Verify the Document objects work with LlamaIndex VectorStoreIndex."""
+    from llama_index.core.schema import Document as LIDocument
+
+    reader = DeepgramAudioReader()
+    documents = reader.load_data([AUDIO_URL])
+    doc = documents[0]
+
+    assert isinstance(doc, LIDocument), "Document is not a LlamaIndex Document"
+    assert doc.get_content() == doc.text, "get_content() should return text"
+    assert doc.metadata is not None, "Document should have metadata"
+
+    print("✓ Documents are valid LlamaIndex Document objects")
+
+
+if __name__ == "__main__":
+    test_deepgram_stt()
+    test_audio_reader_load_data()
+    test_audio_reader_intelligence_metadata()
+    test_document_is_indexable()