docs(recipes): add example of talking to audio file (#1794)

collindutter · web-flow · commit fe27585cf81d · 2025-03-05T21:43:39.000Z
diff --git a/docs/examples/src/talk_to_an_audio_1.py b/docs/examples/src/talk_to_an_audio_1.py
@@ -0,0 +1,22 @@
+from typing import cast
+
+from griptape.artifacts.audio_artifact import AudioArtifact
+from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
+from griptape.loaders import AudioLoader
+from griptape.tasks import PromptTask
+
+prompt_driver = OpenAiChatPromptDriver(
+    model="gpt-4o-audio-preview",
+    modalities=["audio", "text"],
+    audio={"voice": "sage", "format": "mp3"},
+)
+audio_loader = AudioLoader()
+task = PromptTask(prompt_driver=prompt_driver)
+
+audio_file = audio_loader.load("tests/resources/audio.mp3")
+result = cast(AudioArtifact, task.run(["Transcribe this audio but like a pirate", audio_file]))
+audio_loader.save("pirate_audio.mp3", result)
+print(result.meta["transcript"])
+
+result = cast(AudioArtifact, task.run(["What is the tone of the person speaking?", audio_file]))
+print(result.meta["transcript"])
diff --git a/docs/examples/src/talk_to_an_audio_2.py b/docs/examples/src/talk_to_an_audio_2.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import base64
+from typing import TYPE_CHECKING, Optional
+
+import attrs
+import pyaudio  # pyright: ignore[reportMissingModuleSource]
+
+from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
+from griptape.events.audio_chunk_event import AudioChunkEvent
+from griptape.structures.agent import Agent
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+
+@attrs.define
+class AudioPlayer:
+    """Simple audio player using PyAudio."""
+
+    format: int = attrs.field(default=pyaudio.paInt16)
+    channels: int = attrs.field(default=1)
+    rate: int = attrs.field(default=24000)
+    chunk_size: int = attrs.field(default=1024)
+
+    audio: pyaudio.PyAudio = attrs.field(default=attrs.Factory(lambda: pyaudio.PyAudio()))
+    stream: pyaudio.Stream = attrs.field(init=False)
+
+    def __enter__(self) -> AudioPlayer:
+        self.stream = self.audio.open(
+            format=self.format,
+            channels=self.channels,
+            rate=self.rate,
+            output=True,
+            frames_per_buffer=self.chunk_size,
+        )
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_traceback: Optional[TracebackType],
+    ) -> None:
+        self.close()
+
+    def write(self, audio_bytes: bytes) -> None:
+        """Write audio bytes to the audio player. i.e. play the audio."""
+        for i in range(0, len(audio_bytes), self.chunk_size):
+            chunk = audio_bytes[i : i + self.chunk_size]
+            self.stream.write(chunk)
+
+    def close(self) -> None:
+        """Close the audio player and terminate resources."""
+        if self.stream:
+            self.stream.stop_stream()
+            self.stream.close()
+        self.audio.terminate()
+
+
+agent = Agent(
+    prompt_driver=OpenAiChatPromptDriver(
+        model="gpt-4o-audio-preview",
+        modalities=["audio", "text"],
+        audio={"voice": "sage", "format": "pcm16"},
+        stream=True,
+    )
+)
+
+
+with AudioPlayer() as audio_player:
+    for event in agent.run_stream("Hi there"):
+        if isinstance(event, AudioChunkEvent):
+            audio_player.write(base64.b64decode(event.data))
diff --git a/docs/examples/talk-to-an-audio.md b/docs/examples/talk-to-an-audio.md
@@ -0,0 +1,30 @@
+Certain models are capable of handling more modalities than text.
+OpenAI's `gpt-4o-audio-preview`, for instance, can accept and produce both text and audio.
+In this example, we'll use OpenAI's [gpt-4o-audio-preview](https://platform.openai.com/docs/guides/audio) model to re-transcribe an audio file as a pirate, and then determine the tone of the speaker.
+
+!!! important
+
+    `modalities=["audio", "text"]` must be provided to use this model.
+
+!!! tip
+
+    Try playing around with the available [voice options](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
+
+```python
+--8<-- "docs/examples/src/talk_to_an_audio_1.py"
+```
+
+!!! note
+
+    [Text To Speech Drivers](../griptape-framework/drivers/text-to-speech-drivers.md) and [Audio Transcription Drivers](../griptape-framework/drivers/audio-transcription-drivers.md) may provide a more performant, cost-effective solution.
+
+We can also stream back responses in real-time for a more interactive, conversational experience.
+Although playing audio streams isn't a core `griptape` feature, we can implement a simple `AudioPlayer` utility with `pyaudio` to demonstrate streaming audio playback.
+
+!!! important
+
+    Griptape does not include `pyaudio` as a dependency. See `pyaudio`'s [installation instructions](https://pypi.org/project/PyAudio/) for details.
+
+```python
+--8<-- "docs/examples/src/talk_to_an_audio_2.py"
+```
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -170,6 +170,7 @@ nav:
           - Talk to a Video: "examples/talk-to-a-video.md"
           - Talk to an Image: "examples/talk-to-an-image.md"
           - Talk to a Document: "examples/talk-to-a-document.md"
+          - Talk to an Audio: "examples/talk-to-an-audio.md"
           - Multi Agent Workflows: "examples/multi-agent-workflow.md"
           - Shared Memory Between Agents: "examples/multiple-agent-shared-memory.md"
           - Chat Sessions with Amazon DynamoDB: "examples/amazon-dynamodb-sessions.md"
diff --git a/tests/integration/test_code_blocks.py b/tests/integration/test_code_blocks.py
@@ -22,6 +22,7 @@
     "docs/griptape-framework/structures/src/observability_1.py",
     "docs/griptape-framework/structures/src/observability_2.py",
     "docs/griptape-framework/data/src/loaders_9.py",
+    "docs/examples/src/talk_to_an_audio_2.py",
 ]
 
 
diff --git a/tests/resources/audio.mp3 b/tests/resources/audio.mp3
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@`
`22`	`22`	`"docs/griptape-framework/structures/src/observability_1.py",`
`23`	`23`	`"docs/griptape-framework/structures/src/observability_2.py",`
`24`	`24`	`"docs/griptape-framework/data/src/loaders_9.py",`
	`25`	`+ "docs/examples/src/talk_to_an_audio_2.py",`
`25`	`26`	`]`
`26`	`27`
`27`	`28`