Skip to content

Commit fe27585

Browse files
authored
docs(recipes): add example of talking to audio file (#1794)
1 parent 2d0238d commit fe27585

File tree

7 files changed

+130
-0
lines changed

7 files changed

+130
-0
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from typing import cast
2+
3+
from griptape.artifacts.audio_artifact import AudioArtifact
4+
from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
5+
from griptape.loaders import AudioLoader
6+
from griptape.tasks import PromptTask
7+
8+
# Configure the audio-capable OpenAI chat model. Both output modalities are
# requested, and generated speech is returned as mp3 using the "sage" voice.
prompt_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "sage", "format": "mp3"},
)
task = PromptTask(prompt_driver=prompt_driver)
audio_loader = AudioLoader()

# Load the recording once; the same artifact is sent along with both prompts.
audio_file = audio_loader.load("tests/resources/audio.mp3")

# First prompt: have the model re-voice the recording, save the returned
# audio to disk and print the transcript attached to the artifact's metadata.
pirate_prompt = ["Transcribe this audio but like a pirate", audio_file]
pirate_result = cast(AudioArtifact, task.run(pirate_prompt))
audio_loader.save("pirate_audio.mp3", pirate_result)
print(pirate_result.meta["transcript"])

# Second prompt: ask a question about the same recording and print the
# transcript of the model's answer.
tone_prompt = ["What is the tone of the person speaking?", audio_file]
tone_result = cast(AudioArtifact, task.run(tone_prompt))
print(tone_result.meta["transcript"])
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations
2+
3+
import base64
4+
from typing import TYPE_CHECKING, Optional
5+
6+
import attrs
7+
import pyaudio # pyright: ignore[reportMissingModuleSource]
8+
9+
from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
10+
from griptape.events.audio_chunk_event import AudioChunkEvent
11+
from griptape.structures.agent import Agent
12+
13+
if TYPE_CHECKING:
14+
from types import TracebackType
15+
16+
17+
@attrs.define
class AudioPlayer:
    """Simple audio player using PyAudio.

    Intended to be used as a context manager: entering opens an output stream
    configured from the fields below, and exiting stops that stream and
    terminates the PyAudio instance.

    Attributes:
        format: Sample format passed to ``PyAudio.open`` (defaults to ``pyaudio.paInt16``).
        channels: Number of output channels passed to ``PyAudio.open``.
        rate: Sample rate passed to ``PyAudio.open``.
        chunk_size: Used both as ``frames_per_buffer`` when opening the stream
            and as the slice length (in bytes) when writing audio.
        audio: PyAudio instance used to open the stream; terminated by ``close``.
        stream: The open output stream, or ``None`` while the player is not open.
    """

    format: int = attrs.field(default=pyaudio.paInt16)
    channels: int = attrs.field(default=1)
    rate: int = attrs.field(default=24000)
    chunk_size: int = attrs.field(default=1024)

    audio: pyaudio.PyAudio = attrs.field(default=attrs.Factory(lambda: pyaudio.PyAudio()))
    # Defaults to None so close()/write() fail cleanly (instead of raising
    # AttributeError on an unset attribute) when the stream was never opened.
    stream: Optional[pyaudio.Stream] = attrs.field(init=False, default=None)

    def __enter__(self) -> AudioPlayer:
        """Open the output stream and return the player.

        Raises:
            Exception: Whatever ``PyAudio.open`` raises; the PyAudio instance
                is terminated before the error propagates.
        """
        try:
            self.stream = self.audio.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                output=True,
                frames_per_buffer=self.chunk_size,
            )
        except Exception:
            # __exit__ is not invoked when __enter__ raises, so release the
            # PyAudio instance here or it would never be terminated.
            self.audio.terminate()
            raise
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_traceback: Optional[TracebackType],
    ) -> None:
        """Release audio resources; exceptions from the ``with`` body are not suppressed."""
        self.close()

    def write(self, audio_bytes: bytes) -> None:
        """Write audio bytes to the audio player. i.e. play the audio.

        Args:
            audio_bytes: Raw audio data, written to the stream in slices of
                ``chunk_size`` bytes.

        Raises:
            RuntimeError: If the stream has not been opened (or was already closed).
        """
        if self.stream is None:
            raise RuntimeError("AudioPlayer is not open; enter it as a context manager before writing.")
        for start in range(0, len(audio_bytes), self.chunk_size):
            self.stream.write(audio_bytes[start : start + self.chunk_size])

    def close(self) -> None:
        """Close the audio player and terminate resources.

        Safe to call when the stream was never opened. The PyAudio instance is
        terminated even if stopping or closing the stream raises.
        """
        # Detach the stream first so a failure below cannot leave a stale
        # reference that a later write() would try to use.
        stream, self.stream = self.stream, None
        try:
            if stream is not None:
                try:
                    stream.stop_stream()
                finally:
                    stream.close()
        finally:
            self.audio.terminate()
59+
60+
61+
# Streaming driver: audio is requested as raw "pcm16" and the response is
# streamed, so audio chunks can be played as they arrive.
streaming_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "sage", "format": "pcm16"},
    stream=True,
)
agent = Agent(prompt_driver=streaming_driver)


with AudioPlayer() as audio_player:
    for event in agent.run_stream("Hi there"):
        # Only audio chunk events carry playable data; skip everything else.
        if not isinstance(event, AudioChunkEvent):
            continue
        # Chunk payloads are base64-encoded; decode to bytes before playback.
        audio_player.write(base64.b64decode(event.data))

docs/examples/talk-to-an-audio.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
Certain models are capable of handling more modalities than text.
2+
OpenAI's `gpt-4o-audio-preview`, for instance, can accept and produce both text and audio.
3+
In this example, we'll use OpenAI's [gpt-4o-audio-preview](https://platform.openai.com/docs/guides/audio) model to re-transcribe an audio file as a pirate, and then determine the tone of the speaker.
4+
5+
!!! important
6+
7+
`modalities=["audio", "text"]` must be provided to use this model.
8+
9+
!!! tip
10+
11+
Try playing around with the available [voice options](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
12+
13+
```python
14+
--8<-- "docs/examples/src/talk_to_an_audio_1.py"
15+
```
16+
17+
!!! note
18+
19+
[Text To Speech Drivers](../griptape-framework/drivers/text-to-speech-drivers.md) and [Audio Transcription Drivers](../griptape-framework/drivers/audio-transcription-drivers.md) may provide a more performant, cost-effective solution.
20+
21+
We can also stream back responses in real-time for a more interactive, conversational experience.
22+
Although playing audio streams isn't a core `griptape` feature, we can implement a simple `AudioPlayer` utility with `pyaudio` to demonstrate streaming audio playback.
23+
24+
!!! important
25+
26+
Griptape does not include `pyaudio` as a dependency. See `pyaudio`'s [installation instructions](https://pypi.org/project/PyAudio/) for details.
27+
28+
```python
29+
--8<-- "docs/examples/src/talk_to_an_audio_2.py"
30+
```

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ nav:
170170
- Talk to a Video: "examples/talk-to-a-video.md"
171171
- Talk to an Image: "examples/talk-to-an-image.md"
172172
- Talk to a Document: "examples/talk-to-a-document.md"
173+
- Talk to an Audio: "examples/talk-to-an-audio.md"
173174
- Multi Agent Workflows: "examples/multi-agent-workflow.md"
174175
- Shared Memory Between Agents: "examples/multiple-agent-shared-memory.md"
175176
- Chat Sessions with Amazon DynamoDB: "examples/amazon-dynamodb-sessions.md"

tests/integration/test_code_blocks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"docs/griptape-framework/structures/src/observability_1.py",
2323
"docs/griptape-framework/structures/src/observability_2.py",
2424
"docs/griptape-framework/data/src/loaders_9.py",
25+
"docs/examples/src/talk_to_an_audio_2.py",
2526
]
2627

2728

tests/resources/audio.mp3

90.5 KB
Binary file not shown.

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)