Skip to content

Commit 2a6f2a9

Browse files
authored
feat: add support for gpt-image-1 (#1921)
1 parent 0d049da commit 2a6f2a9

File tree

8 files changed

+208
-85
lines changed

8 files changed

+208
-85
lines changed

docs/griptape-framework/drivers/image-generation-drivers.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ This Driver supports negative prompts. When provided, the image generation reque
113113

114114
The [OpenAI Image Generation Driver](../../reference/griptape/drivers/image_generation/openai_image_generation_driver.md) provides access to OpenAI image generation models. Like other OpenAI Drivers, the image generation Driver will implicitly load an API key in the `OPENAI_API_KEY` environment variable if one is not explicitly provided.
115115

116-
This Driver supports image generation configurations like style presets, image quality preference, and image size. For details on supported configuration values, see the [OpenAI documentation](https://platform.openai.com/docs/guides/images/introduction).
116+
This Driver supports image generation configurations like style presets, image quality preference, and image size. For details on supported configuration values, see the [OpenAI documentation](https://platform.openai.com/docs/guides/image-generation).
117117

118118
=== "Code"
119119

docs/griptape-framework/drivers/src/image_generation_drivers_1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from griptape.tools import PromptImageGenerationTool
44

55
driver = OpenAiImageGenerationDriver(
6-
model="dall-e-2",
6+
model="gpt-image-1",
77
)
88

99
agent = Agent(
Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,9 @@
11
from griptape.drivers.image_generation.openai import OpenAiImageGenerationDriver
22
from griptape.structures import Agent
3-
from griptape.tools import PromptImageGenerationTool
3+
from griptape.tools import FileManagerTool, PromptImageGenerationTool
44

5-
driver = OpenAiImageGenerationDriver(
6-
model="dall-e-2",
7-
image_size="512x512",
8-
)
5+
driver = OpenAiImageGenerationDriver(model="gpt-image-1")
96

7+
agent = Agent(tools=[PromptImageGenerationTool(image_generation_driver=driver, off_prompt=True), FileManagerTool()])
108

11-
agent = Agent(
12-
tools=[
13-
PromptImageGenerationTool(image_generation_driver=driver),
14-
]
15-
)
16-
17-
agent.run("Generate a watercolor painting of a dog riding a skateboard")
9+
agent.run("Generate a watercolor painting of a dog riding a skateboard and save it to dog.png")
Lines changed: 128 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
from __future__ import annotations
22

33
import base64
4-
from typing import TYPE_CHECKING, Literal, Optional, cast
4+
from typing import TYPE_CHECKING, Literal, Optional
55

66
import openai
7-
from attrs import define, field
7+
from attrs import define, field, fields_dict
88

9-
from griptape.artifacts import ImageArtifact
109
from griptape.drivers.image_generation import BaseImageGenerationDriver
1110
from griptape.utils.decorators import lazy_property
1211

1312
if TYPE_CHECKING:
1413
from openai.types.images_response import ImagesResponse
1514

15+
from griptape.artifacts import ImageArtifact
16+
1617

1718
@define
1819
class OpenAiImageGenerationDriver(BaseImageGenerationDriver):
@@ -32,49 +33,106 @@ class OpenAiImageGenerationDriver(BaseImageGenerationDriver):
3233
dall-e-3: [1024x1024, 1024x1792, 1792x1024]
3334
response_format: The response format. Currently only supports 'b64_json' which will return
3435
a base64 encoded image in a JSON object.
36+
background: Optional and only supported for gpt-image-1. Can be either 'transparent', 'opaque', or 'auto'.
37+
moderation: Optional and only supported for gpt-image-1. Can be either 'low' or 'auto'.
38+
output_compression: Optional and only supported for gpt-image-1. Can be an integer between 0 and 100.
39+
output_format: Optional and only supported for gpt-image-1. Can be either 'png' or 'jpeg'.
3540
"""
3641

3742
api_type: Optional[str] = field(default=openai.api_type, kw_only=True)
3843
api_version: Optional[str] = field(default=openai.api_version, kw_only=True, metadata={"serializable": True})
3944
base_url: Optional[str] = field(default=None, kw_only=True, metadata={"serializable": True})
4045
api_key: Optional[str] = field(default=None, kw_only=True, metadata={"serializable": False})
4146
organization: Optional[str] = field(default=openai.organization, kw_only=True, metadata={"serializable": True})
42-
style: Optional[str] = field(default=None, kw_only=True, metadata={"serializable": True})
43-
quality: Literal["standard", "hd"] = field(
44-
default="standard",
47+
style: Optional[Literal["vivid", "natural"]] = field(
48+
default=None, kw_only=True, metadata={"serializable": True, "model_allowlist": ["dall-e-3"]}
49+
)
50+
quality: Optional[Literal["standard", "hd", "low", "medium", "high", "auto"]] = field(
51+
default=None,
52+
kw_only=True,
53+
metadata={"serializable": True},
54+
)
55+
image_size: Optional[Literal["256x256", "512x512", "1024x1024", "1024x1792", "1792x1024"]] = field(
56+
default=None,
4557
kw_only=True,
4658
metadata={"serializable": True},
4759
)
48-
image_size: Literal["256x256", "512x512", "1024x1024", "1024x1792", "1792x1024"] = field(
49-
default="1024x1024", kw_only=True, metadata={"serializable": True}
60+
response_format: Literal["b64_json"] = field(
61+
default="b64_json",
62+
kw_only=True,
63+
metadata={"serializable": True, "model_denylist": ["gpt-image-1"]},
64+
)
65+
background: Optional[Literal["transparent", "opaque", "auto"]] = field(
66+
default=None,
67+
kw_only=True,
68+
metadata={"serializable": True, "model_allowlist": ["gpt-image-1"]},
69+
)
70+
moderation: Optional[Literal["low", "auto"]] = field(
71+
default=None,
72+
kw_only=True,
73+
metadata={"serializable": True, "model_allowlist": ["gpt-image-1"]},
74+
)
75+
output_compression: Optional[int] = field(
76+
default=None,
77+
kw_only=True,
78+
metadata={"serializable": True, "model_allowlist": ["gpt-image-1"]},
79+
)
80+
output_format: Optional[Literal["png", "jpeg"]] = field(
81+
default=None,
82+
kw_only=True,
83+
metadata={"serializable": True, "model_allowlist": ["gpt-image-1"]},
5084
)
51-
response_format: Literal["b64_json"] = field(default="b64_json", kw_only=True, metadata={"serializable": True})
5285
_client: Optional[openai.OpenAI] = field(
5386
default=None, kw_only=True, alias="client", metadata={"serializable": False}
5487
)
5588

89+
@image_size.validator # pyright: ignore[reportAttributeAccessIssue, reportOptionalMemberAccess]
90+
def validate_image_size(self, attribute: str, value: str | None) -> None:
91+
"""Validates the image size based on the model.
92+
93+
Must be one of `1024x1024`, `1536x1024` (landscape), `1024x1536` (portrait), or `auto` (default value) for
94+
`gpt-image-1`, one of `256x256`, `512x512`, or `1024x1024` for `dall-e-2`, and
95+
one of `1024x1024`, `1792x1024`, or `1024x1792` for `dall-e-3`.
96+
97+
"""
98+
if value is None:
99+
return
100+
101+
if self.model.startswith("gpt-image"):
102+
allowed_sizes = ("1024x1024", "1536x1024", "1024x1536", "auto")
103+
elif self.model == "dall-e-2":
104+
allowed_sizes = ("256x256", "512x512", "1024x1024")
105+
elif self.model == "dall-e-3":
106+
allowed_sizes = ("1024x1024", "1792x1024", "1024x1792")
107+
else:
108+
raise NotImplementedError(f"Image size validation not implemented for model {self.model}")
109+
110+
if value is not None and value not in allowed_sizes:
111+
raise ValueError(f"Image size, {value}, must be one of the following: {allowed_sizes}")
112+
56113
@lazy_property()
57114
def client(self) -> openai.OpenAI:
58115
return openai.OpenAI(api_key=self.api_key, base_url=self.base_url, organization=self.organization)
59116

60117
def try_text_to_image(self, prompts: list[str], negative_prompts: Optional[list[str]] = None) -> ImageArtifact:
61118
prompt = ", ".join(prompts)
62119

63-
additional_params = {}
64-
65-
if self.style:
66-
additional_params["style"] = self.style
67-
68-
if self.quality:
69-
additional_params["quality"] = self.quality
70-
71120
response = self.client.images.generate(
72121
model=self.model,
73122
prompt=prompt,
74-
size=self.image_size,
75-
response_format=self.response_format,
76123
n=1,
77-
**additional_params,
124+
**self._build_model_params(
125+
{
126+
"size": "image_size",
127+
"quality": "quality",
128+
"style": "style",
129+
"response_format": "response_format",
130+
"background": "background",
131+
"moderation": "moderation",
132+
"output_compression": "output_compression",
133+
"output_format": "output_format",
134+
}
135+
),
78136
)
79137

80138
return self._parse_image_response(response, prompt)
@@ -85,13 +143,18 @@ def try_image_variation(
85143
image: ImageArtifact,
86144
negative_prompts: Optional[list[str]] = None,
87145
) -> ImageArtifact:
88-
image_size = self._dall_e_2_filter_image_size("variation")
146+
"""Creates a variation of an image.
89147
148+
Only supported for dall-e-2. Requires image size to be one of the following:
149+
[256x256, 512x512, 1024x1024]
150+
"""
151+
if self.model != "dall-e-2":
152+
raise NotImplementedError("Image variation only supports dall-e-2")
90153
response = self.client.images.create_variation(
91154
image=image.value,
92155
n=1,
93156
response_format=self.response_format,
94-
size=image_size,
157+
size=self.image_size, # pyright: ignore[reportArgumentType]
95158
)
96159

97160
return self._parse_image_response(response, "")
@@ -103,15 +166,17 @@ def try_image_inpainting(
103166
mask: ImageArtifact,
104167
negative_prompts: Optional[list[str]] = None,
105168
) -> ImageArtifact:
106-
image_size = self._dall_e_2_filter_image_size("inpainting")
107-
108169
prompt = ", ".join(prompts)
109170
response = self.client.images.edit(
110171
prompt=prompt,
111172
image=image.value,
112173
mask=mask.value,
113-
response_format=self.response_format,
114-
size=image_size,
174+
**self._build_model_params(
175+
{
176+
"size": "image_size",
177+
"response_format": "response_format",
178+
}
179+
),
115180
)
116181

117182
return self._parse_image_response(response, prompt)
@@ -125,29 +190,45 @@ def try_image_outpainting(
125190
) -> ImageArtifact:
126191
raise NotImplementedError(f"{self.__class__.__name__} does not support outpainting")
127192

128-
def _image_size_to_ints(self, image_size: str) -> list[int]:
129-
return [int(x) for x in image_size.split("x")]
130-
131-
def _dall_e_2_filter_image_size(self, method: str) -> Literal["256x256", "512x512", "1024x1024"]:
132-
if self.model != "dall-e-2":
133-
raise NotImplementedError(f"{method} only supports dall-e-2")
134-
135-
if self.image_size not in {"256x256", "512x512", "1024x1024"}:
136-
raise ValueError(f"support image sizes for {method} are 256x256, 512x512, and 1024x1024")
137-
138-
return cast("Literal['256x256', '512x512', '1024x1024']", self.image_size)
139-
140193
def _parse_image_response(self, response: ImagesResponse, prompt: str) -> ImageArtifact:
194+
from griptape.loaders.image_loader import ImageLoader
195+
141196
if response.data is None or response.data[0] is None or response.data[0].b64_json is None:
142197
raise Exception("Failed to generate image")
143198

144199
image_data = base64.b64decode(response.data[0].b64_json)
145-
image_dimensions = self._image_size_to_ints(self.image_size)
146-
147-
return ImageArtifact(
148-
value=image_data,
149-
format="png",
150-
width=image_dimensions[0],
151-
height=image_dimensions[1],
152-
meta={"model": self.model, "prompt": prompt},
153-
)
200+
201+
image_artifact = ImageLoader().parse(image_data)
202+
203+
image_artifact.meta["prompt"] = prompt
204+
image_artifact.meta["model"] = self.model
205+
206+
return image_artifact
207+
208+
def _build_model_params(self, values: dict) -> dict:
209+
"""Builds parameters while considering field metadata and None values.
210+
211+
Args:
212+
values: A dictionary mapping parameter names to field names.
213+
214+
Field will be added to the params dictionary if all conditions are met:
215+
- The field value is not None
216+
- The model_allowlist is None or the model is in the allowlist
217+
- The model_denylist is None or the model is not in the denylist
218+
"""
219+
params = {}
220+
221+
fields = fields_dict(self.__class__)
222+
for param_name, field_name in values.items():
223+
metadata = fields[field_name].metadata
224+
model_allowlist = metadata.get("model_allowlist")
225+
model_denylist = metadata.get("model_denylist")
226+
227+
field_value = getattr(self, field_name, None)
228+
229+
allowlist_condition = model_allowlist is None or self.model in model_allowlist
230+
denylist_condition = model_denylist is None or self.model not in model_denylist
231+
232+
if field_value is not None and allowlist_condition and denylist_condition:
233+
params[param_name] = field_value
234+
return params

tests/unit/configs/drivers/test_azure_openai_drivers_config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,16 @@ def test_to_dict(self, config):
5858
"image_generation_driver": {
5959
"api_version": "2024-02-01",
6060
"base_url": None,
61+
"background": None,
6162
"image_size": "512x512",
6263
"model": "dall-e-2",
64+
"moderation": None,
6365
"azure_deployment": "dall-e-2",
6466
"azure_endpoint": "http://localhost:8080",
6567
"organization": None,
66-
"quality": "standard",
68+
"output_compression": None,
69+
"output_format": None,
70+
"quality": None,
6771
"response_format": "b64_json",
6872
"style": None,
6973
"type": "AzureOpenAiImageGenerationDriver",

tests/unit/configs/drivers/test_openai_driver_config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,14 @@ def test_to_dict(self, config):
4747
"image_generation_driver": {
4848
"api_version": None,
4949
"base_url": None,
50+
"background": None,
5051
"image_size": "512x512",
5152
"model": "dall-e-2",
5253
"organization": None,
53-
"quality": "standard",
54+
"output_compression": None,
55+
"output_format": None,
56+
"moderation": None,
57+
"quality": None,
5458
"response_format": "b64_json",
5559
"style": None,
5660
"type": "OpenAiImageGenerationDriver",

tests/unit/drivers/image_generation/test_azure_openai_image_generation_driver.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,37 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Optional
14
from unittest.mock import Mock
25

6+
import PIL.Image
37
import pytest
48

59
from griptape.drivers.image_generation.openai import AzureOpenAiImageGenerationDriver
610

11+
if TYPE_CHECKING:
12+
import io
13+
14+
15+
@pytest.fixture(autouse=True)
16+
def _patch_pillow_open(mocker):
17+
"""Stub out PIL.Image.open so no real decoding is attempted."""
18+
19+
class _FakeImage:
20+
def __init__(self) -> None:
21+
self.format: str = "PNG"
22+
self.width: int = 512
23+
self.height: int = 512
24+
25+
def save(self, fp: io.BytesIO, *, _: Optional[str] = None) -> None:
26+
fp.write(b"image data")
27+
28+
mocker.patch.object(
29+
PIL.Image,
30+
"open",
31+
side_effect=lambda *_, **__: _FakeImage(),
32+
autospec=True,
33+
)
34+
735

836
class TestAzureOpenAiImageGenerationDriver:
937
@pytest.fixture()
@@ -13,27 +41,18 @@ def driver(self):
1341
client=Mock(),
1442
azure_endpoint="https://dalle.example.com",
1543
azure_deployment="dalle-deployment",
16-
image_size="512x512",
44+
image_size="1024x1024",
1745
)
1846

1947
def test_init(self, driver):
2048
assert driver
2149
assert (
2250
AzureOpenAiImageGenerationDriver(
23-
model="dall-e-3", client=Mock(), azure_endpoint="https://dalle.example.com", image_size="512x512"
51+
model="dall-e-3", client=Mock(), azure_endpoint="https://dalle.example.com", image_size="1024x1024"
2452
).azure_deployment
2553
== "dall-e-3"
2654
)
2755

28-
def test_init_requires_endpoint(self):
29-
with pytest.raises(TypeError):
30-
AzureOpenAiImageGenerationDriver(
31-
model="dall-e-3",
32-
client=Mock(),
33-
azure_deployment="dalle-deployment",
34-
image_size="512x512",
35-
) # pyright: ignore[reportCallIssues]
36-
3756
def test_try_text_to_image(self, driver):
3857
driver.client.images.generate.return_value = Mock(data=[Mock(b64_json=b"aW1hZ2UgZGF0YQ==")])
3958

0 commit comments

Comments
 (0)