diff --git a/.release-please-manifest.json b/.release-please-manifest.json index f2761d4022..12cec28d56 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "1.106.1" + ".": "1.107.0" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index c41be6ee57..36a3c7f587 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-51afd6abbcb18c3086f62993f9379c18443b9e516cbc0548ddfb932e835657f8.yml -openapi_spec_hash: dae6afeaefa15cb8700c7a870531e06f -config_hash: b854932c0ea24b400bdd64e4376936bd +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-7807ec6037efcee1af7decbfd3974a42b761fb6c6a71b4050fe43484d7fcbac4.yml +openapi_spec_hash: da6851e3891ad2659a50ed6a736fd32a +config_hash: 74d955cdc2377213f5268ea309090f6c diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ad7d1490..76d5dcb2dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## 1.107.0 (2025-09-08) + +Full Changelog: [v1.106.1...v1.107.0](https://github.com/openai/openai-python/compare/v1.106.1...v1.107.0) + +### Features + +* **api:** ship the RealtimeGA API shape ([dc319d8](https://github.com/openai/openai-python/commit/dc319d8bbb3a20108399c1d15f98e63bdd84eb5c)) + + +### Chores + +* **internal:** codegen related update ([b79b7ca](https://github.com/openai/openai-python/commit/b79b7ca3a72009a036db0a344b500f616ca0443f)) + ## 1.106.1 (2025-09-04) Full Changelog: [v1.106.0...v1.106.1](https://github.com/openai/openai-python/compare/v1.106.0...v1.106.1) diff --git a/api.md b/api.md index a8a95bd23e..7c947fffe1 100644 --- a/api.md +++ b/api.md @@ -863,6 +863,7 @@ Types: ```python from openai.types.realtime import ( + AudioTranscription, ConversationCreatedEvent, ConversationItem, ConversationItemAdded, @@ -891,11 +892,16 @@ from openai.types.realtime import ( McpListToolsCompleted, McpListToolsFailed, McpListToolsInProgress, + Models, + NoiseReductionType, OutputAudioBufferClearEvent, RateLimitsUpdatedEvent, RealtimeAudioConfig, + RealtimeAudioConfigInput, + RealtimeAudioConfigOutput, + RealtimeAudioFormats, + RealtimeAudioInputTurnDetection, RealtimeClientEvent, - RealtimeClientSecretConfig, RealtimeConversationItemAssistantMessage, RealtimeConversationItemFunctionCall, RealtimeConversationItemFunctionCallOutput, @@ -911,6 +917,9 @@ from openai.types.realtime import ( RealtimeMcpToolExecutionError, RealtimeMcphttpError, RealtimeResponse, + RealtimeResponseCreateAudioOutput, + RealtimeResponseCreateMcpTool, + RealtimeResponseCreateParams, RealtimeResponseStatus, RealtimeResponseUsage, RealtimeResponseUsageInputTokenDetails, @@ -922,8 +931,12 @@ from openai.types.realtime import ( RealtimeToolsConfig, RealtimeToolsConfigUnion, RealtimeTracingConfig, + RealtimeTranscriptionSessionAudio, + RealtimeTranscriptionSessionAudioInput, + RealtimeTranscriptionSessionAudioInputTurnDetection, RealtimeTranscriptionSessionCreateRequest, RealtimeTruncation, + RealtimeTruncationRetentionRatio, ResponseAudioDeltaEvent, ResponseAudioDoneEvent, ResponseAudioTranscriptDeltaEvent, @@ -959,7 +972,15 @@ from openai.types.realtime import ( Types: ```python -from openai.types.realtime import RealtimeSessionCreateResponse, ClientSecretCreateResponse +from openai.types.realtime import ( + RealtimeSessionClientSecret, + RealtimeSessionCreateResponse, + RealtimeTranscriptionSessionClientSecret, + 
RealtimeTranscriptionSessionCreateResponse, + RealtimeTranscriptionSessionInputAudioTranscription, + RealtimeTranscriptionSessionTurnDetection, + ClientSecretCreateResponse, +) ``` Methods: diff --git a/pyproject.toml b/pyproject.toml index 82aa72b045..5c3985cc7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openai" -version = "1.106.1" +version = "1.107.0" description = "The official Python library for the openai API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/requirements-dev.lock b/requirements-dev.lock index 669378387d..7d690683e9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -70,7 +70,7 @@ filelock==3.12.4 frozenlist==1.7.0 # via aiohttp # via aiosignal -griffe==1.13.0 +griffe==1.14.0 h11==0.16.0 # via httpcore httpcore==1.0.9 diff --git a/src/openai/_version.py b/src/openai/_version.py index 33c16fef6a..06826fc4de 100644 --- a/src/openai/_version.py +++ b/src/openai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "openai" -__version__ = "1.106.1" # x-release-please-version +__version__ = "1.107.0" # x-release-please-version diff --git a/src/openai/resources/realtime/client_secrets.py b/src/openai/resources/realtime/client_secrets.py index ba0f9ee538..a79460746d 100644 --- a/src/openai/resources/realtime/client_secrets.py +++ b/src/openai/resources/realtime/client_secrets.py @@ -50,11 +50,13 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> ClientSecretCreateResponse: """ - Create a Realtime session and client secret for either realtime or - transcription. + Create a Realtime client secret with an associated session configuration. Args: - expires_after: Configuration for the ephemeral token expiration. + expires_after: Configuration for the client secret expiration. Expiration refers to the time + after which a client secret will no longer be valid for creating sessions. The + session itself may continue after that time once started. A secret can be used + to create multiple sessions until it expires. session: Session configuration to use for the client secret. Choose either a realtime session or a transcription session. @@ -116,11 +118,13 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, ) -> ClientSecretCreateResponse: """ - Create a Realtime session and client secret for either realtime or - transcription. + Create a Realtime client secret with an associated session configuration. Args: - expires_after: Configuration for the ephemeral token expiration. + expires_after: Configuration for the client secret expiration. Expiration refers to the time + after which a client secret will no longer be valid for creating sessions. The + session itself may continue after that time once started. A secret can be used + to create multiple sessions until it expires. session: Session configuration to use for the client secret. Choose either a realtime session or a transcription session. 
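For context on the `client_secrets.create()` docstring changes above, here is a minimal sketch of how the updated surface might be called. It uses only the parameters shown in this hunk (`expires_after` with `anchor`/`seconds`, plus a `session` payload); the specific session fields and model name below are illustrative assumptions, not something this diff confirms.

```python
from openai import OpenAI

client = OpenAI()

# Create an ephemeral client secret tied to a session configuration.
# Per the updated docstring, `expires_after` bounds how long the secret can be
# used to *start* sessions (10-7200 seconds from `created_at`, defaulting to
# 600 seconds); an already-started session may continue past that time.
secret = client.realtime.client_secrets.create(
    expires_after={"anchor": "created_at", "seconds": 600},
    # Assumed example payload: see RealtimeSessionCreateRequest /
    # RealtimeTranscriptionSessionCreateRequest for the full shapes added
    # in this release.
    session={"type": "realtime", "model": "gpt-realtime"},
)

# The response carries the short-lived secret and the resolved session config
# (a realtime or transcription session, per the Session union below).
print(secret)
```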
diff --git a/src/openai/resources/realtime/realtime.py b/src/openai/resources/realtime/realtime.py index 2f5adf6548..81e6dc54f5 100644 --- a/src/openai/resources/realtime/realtime.py +++ b/src/openai/resources/realtime/realtime.py @@ -32,16 +32,13 @@ ClientSecretsWithStreamingResponse, AsyncClientSecretsWithStreamingResponse, ) -from ...types.realtime import response_create_event_param +from ...types.realtime import session_update_event_param, transcription_session_update_param from ...types.websocket_connection_options import WebsocketConnectionOptions from ...types.realtime.realtime_client_event import RealtimeClientEvent from ...types.realtime.realtime_server_event import RealtimeServerEvent from ...types.realtime.conversation_item_param import ConversationItemParam from ...types.realtime.realtime_client_event_param import RealtimeClientEventParam -from ...types.realtime.realtime_session_create_request_param import RealtimeSessionCreateRequestParam -from ...types.realtime.realtime_transcription_session_create_request_param import ( - RealtimeTranscriptionSessionCreateRequestParam, -) +from ...types.realtime.realtime_response_create_params_param import RealtimeResponseCreateParamsParam if TYPE_CHECKING: from websockets.sync.client import ClientConnection as WebsocketConnection @@ -564,18 +561,18 @@ def __init__(self, connection: RealtimeConnection) -> None: class RealtimeSessionResource(BaseRealtimeConnectionResource): - def update(self, *, session: RealtimeSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN) -> None: + def update(self, *, session: session_update_event_param.Session, event_id: str | NotGiven = NOT_GIVEN) -> None: """ - Send this event to update the session’s default configuration. - The client may send this event at any time to update any field, - except for `voice`. However, note that once a session has been - initialized with a particular `model`, it can’t be changed to - another model using `session.update`. + Send this event to update the session’s configuration. + The client may send this event at any time to update any field + except for `voice` and `model`. `voice` can be updated only if there have been no other + audio outputs yet. When the server receives a `session.update`, it will respond with a `session.updated` event showing the full, effective configuration. - Only the fields that are present are updated. To clear a field like - `instructions`, pass an empty string. + Only the fields that are present in the `session.update` are updated. To clear a field like + `instructions`, pass an empty string. To clear a field like `tools`, pass an empty array. + To clear a field like `turn_detection`, pass `null`. """ self._connection.send( cast( @@ -590,7 +587,7 @@ def create( self, *, event_id: str | NotGiven = NOT_GIVEN, - response: response_create_event_param.Response | NotGiven = NOT_GIVEN, + response: RealtimeResponseCreateParamsParam | NotGiven = NOT_GIVEN, ) -> None: """ This event instructs the server to create a Response, which means triggering @@ -599,15 +596,25 @@ def create( A Response will include at least one Item, and may have two, in which case the second will be a function call. These Items will be appended to the - conversation history. + conversation history by default. The server will respond with a `response.created` event, events for Items and content created, and finally a `response.done` event to indicate the Response is complete. The `response.create` event includes inference configuration like - `instructions`, and `temperature`. 
These fields will override the Session's + `instructions` and `tools`. If these are set, they will override the Session's configuration for this Response only. + + Responses can be created out-of-band of the default Conversation, meaning that they can + have arbitrary input, and it's possible to disable writing the output to the Conversation. + Only one Response can write to the default Conversation at a time, but otherwise multiple + Responses can be created in parallel. The `metadata` field is a good way to disambiguate + multiple simultaneous Responses. + + Clients can set `conversation` to `none` to create a Response that does not write to the default + Conversation. Arbitrary input can be provided with the `input` field, which is an array accepting + raw Items and references to existing Items. """ self._connection.send( cast( @@ -621,7 +628,9 @@ def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str | Not The server will respond with a `response.done` event with a status of `response.status=cancelled`. If - there is no response to cancel, the server will respond with an error. + there is no response to cancel, the server will respond with an error. It's safe + to call `response.cancel` even if no response is in progress, an error will be + returned the session will remain unaffected. """ self._connection.send( cast( @@ -644,16 +653,9 @@ def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: """ - Send this event to commit the user input audio buffer, which will create a - new user message item in the conversation. This event will produce an error - if the input audio buffer is empty. When in Server VAD mode, the client does - not need to send this event, the server will commit the audio buffer - automatically. + Send this event to commit the user input audio buffer, which will create a new user message item in the conversation. This event will produce an error if the input audio buffer is empty. When in Server VAD mode, the client does not need to send this event, the server will commit the audio buffer automatically. - Committing the input audio buffer will trigger input audio transcription - (if enabled in session configuration), but it will not create a response - from the model. The server will respond with an `input_audio_buffer.committed` - event. + Committing the input audio buffer will trigger input audio transcription (if enabled in session configuration), but it will not create a response from the model. The server will respond with an `input_audio_buffer.committed` event. """ self._connection.send( cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) @@ -663,14 +665,17 @@ def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> None: """Send this event to append audio bytes to the input audio buffer. The audio - buffer is temporary storage you can write to and later commit. In Server VAD - mode, the audio buffer is used to detect speech and the server will decide + buffer is temporary storage you can write to and later commit. A "commit" will create a new + user message item in the conversation history from the buffer content and clear the buffer. + Input audio transcription (if enabled) will be generated when the buffer is committed. + + If VAD is enabled the audio buffer is used to detect speech and the server will decide when to commit. 
When Server VAD is disabled, you must commit the audio buffer - manually. + manually. Input audio noise reduction operates on writes to the audio buffer. The client may choose how much audio to place in each event up to a maximum of 15 MiB, for example streaming smaller chunks from the client may allow the - VAD to be more responsive. Unlike made other client events, the server will + VAD to be more responsive. Unlike most other client events, the server will not send a confirmation response to this event. """ self._connection.send( @@ -797,7 +802,7 @@ def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: class RealtimeTranscriptionSessionResource(BaseRealtimeConnectionResource): def update( - self, *, session: RealtimeTranscriptionSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN + self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN ) -> None: """Send this event to update a transcription session.""" self._connection.send( @@ -814,18 +819,20 @@ def __init__(self, connection: AsyncRealtimeConnection) -> None: class AsyncRealtimeSessionResource(BaseAsyncRealtimeConnectionResource): - async def update(self, *, session: RealtimeSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN) -> None: + async def update( + self, *, session: session_update_event_param.Session, event_id: str | NotGiven = NOT_GIVEN + ) -> None: """ - Send this event to update the session’s default configuration. - The client may send this event at any time to update any field, - except for `voice`. However, note that once a session has been - initialized with a particular `model`, it can’t be changed to - another model using `session.update`. + Send this event to update the session’s configuration. + The client may send this event at any time to update any field + except for `voice` and `model`. `voice` can be updated only if there have been no other + audio outputs yet. When the server receives a `session.update`, it will respond with a `session.updated` event showing the full, effective configuration. - Only the fields that are present are updated. To clear a field like - `instructions`, pass an empty string. + Only the fields that are present in the `session.update` are updated. To clear a field like + `instructions`, pass an empty string. To clear a field like `tools`, pass an empty array. + To clear a field like `turn_detection`, pass `null`. """ await self._connection.send( cast( @@ -840,7 +847,7 @@ async def create( self, *, event_id: str | NotGiven = NOT_GIVEN, - response: response_create_event_param.Response | NotGiven = NOT_GIVEN, + response: RealtimeResponseCreateParamsParam | NotGiven = NOT_GIVEN, ) -> None: """ This event instructs the server to create a Response, which means triggering @@ -849,15 +856,25 @@ async def create( A Response will include at least one Item, and may have two, in which case the second will be a function call. These Items will be appended to the - conversation history. + conversation history by default. The server will respond with a `response.created` event, events for Items and content created, and finally a `response.done` event to indicate the Response is complete. The `response.create` event includes inference configuration like - `instructions`, and `temperature`. These fields will override the Session's + `instructions` and `tools`. If these are set, they will override the Session's configuration for this Response only. 
+ + Responses can be created out-of-band of the default Conversation, meaning that they can + have arbitrary input, and it's possible to disable writing the output to the Conversation. + Only one Response can write to the default Conversation at a time, but otherwise multiple + Responses can be created in parallel. The `metadata` field is a good way to disambiguate + multiple simultaneous Responses. + + Clients can set `conversation` to `none` to create a Response that does not write to the default + Conversation. Arbitrary input can be provided with the `input` field, which is an array accepting + raw Items and references to existing Items. """ await self._connection.send( cast( @@ -871,7 +888,9 @@ async def cancel(self, *, event_id: str | NotGiven = NOT_GIVEN, response_id: str The server will respond with a `response.done` event with a status of `response.status=cancelled`. If - there is no response to cancel, the server will respond with an error. + there is no response to cancel, the server will respond with an error. It's safe + to call `response.cancel` even if no response is in progress, an error will be + returned the session will remain unaffected. """ await self._connection.send( cast( @@ -894,16 +913,9 @@ async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: async def commit(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: """ - Send this event to commit the user input audio buffer, which will create a - new user message item in the conversation. This event will produce an error - if the input audio buffer is empty. When in Server VAD mode, the client does - not need to send this event, the server will commit the audio buffer - automatically. + Send this event to commit the user input audio buffer, which will create a new user message item in the conversation. This event will produce an error if the input audio buffer is empty. When in Server VAD mode, the client does not need to send this event, the server will commit the audio buffer automatically. - Committing the input audio buffer will trigger input audio transcription - (if enabled in session configuration), but it will not create a response - from the model. The server will respond with an `input_audio_buffer.committed` - event. + Committing the input audio buffer will trigger input audio transcription (if enabled in session configuration), but it will not create a response from the model. The server will respond with an `input_audio_buffer.committed` event. """ await self._connection.send( cast(RealtimeClientEventParam, strip_not_given({"type": "input_audio_buffer.commit", "event_id": event_id})) @@ -913,14 +925,17 @@ async def append(self, *, audio: str, event_id: str | NotGiven = NOT_GIVEN) -> N """Send this event to append audio bytes to the input audio buffer. The audio - buffer is temporary storage you can write to and later commit. In Server VAD - mode, the audio buffer is used to detect speech and the server will decide + buffer is temporary storage you can write to and later commit. A "commit" will create a new + user message item in the conversation history from the buffer content and clear the buffer. + Input audio transcription (if enabled) will be generated when the buffer is committed. + + If VAD is enabled the audio buffer is used to detect speech and the server will decide when to commit. When Server VAD is disabled, you must commit the audio buffer - manually. + manually. Input audio noise reduction operates on writes to the audio buffer. 
The client may choose how much audio to place in each event up to a maximum of 15 MiB, for example streaming smaller chunks from the client may allow the - VAD to be more responsive. Unlike made other client events, the server will + VAD to be more responsive. Unlike most other client events, the server will not send a confirmation response to this event. """ await self._connection.send( @@ -1047,7 +1062,7 @@ async def clear(self, *, event_id: str | NotGiven = NOT_GIVEN) -> None: class AsyncRealtimeTranscriptionSessionResource(BaseAsyncRealtimeConnectionResource): async def update( - self, *, session: RealtimeTranscriptionSessionCreateRequestParam, event_id: str | NotGiven = NOT_GIVEN + self, *, session: transcription_session_update_param.Session, event_id: str | NotGiven = NOT_GIVEN ) -> None: """Send this event to update a transcription session.""" await self._connection.send( diff --git a/src/openai/types/realtime/__init__.py b/src/openai/types/realtime/__init__.py index b05f620619..6873ba6a2a 100644 --- a/src/openai/types/realtime/__init__.py +++ b/src/openai/types/realtime/__init__.py @@ -2,13 +2,16 @@ from __future__ import annotations +from .models import Models as Models +from .models_param import ModelsParam as ModelsParam from .realtime_error import RealtimeError as RealtimeError -from .realtime_session import RealtimeSession as RealtimeSession from .conversation_item import ConversationItem as ConversationItem from .realtime_response import RealtimeResponse as RealtimeResponse +from .audio_transcription import AudioTranscription as AudioTranscription from .log_prob_properties import LogProbProperties as LogProbProperties from .realtime_truncation import RealtimeTruncation as RealtimeTruncation from .response_done_event import ResponseDoneEvent as ResponseDoneEvent +from .noise_reduction_type import NoiseReductionType as NoiseReductionType from .realtime_error_event import RealtimeErrorEvent as RealtimeErrorEvent from .session_update_event import SessionUpdateEvent as SessionUpdateEvent from .mcp_list_tools_failed import McpListToolsFailed as McpListToolsFailed @@ -21,6 +24,7 @@ from .session_created_event import SessionCreatedEvent as SessionCreatedEvent from .session_updated_event import SessionUpdatedEvent as SessionUpdatedEvent from .conversation_item_done import ConversationItemDone as ConversationItemDone +from .realtime_audio_formats import RealtimeAudioFormats as RealtimeAudioFormats from .realtime_mcp_tool_call import RealtimeMcpToolCall as RealtimeMcpToolCall from .realtime_mcphttp_error import RealtimeMcphttpError as RealtimeMcphttpError from .response_created_event import ResponseCreatedEvent as ResponseCreatedEvent @@ -34,6 +38,7 @@ from .realtime_response_status import RealtimeResponseStatus as RealtimeResponseStatus from .response_mcp_call_failed import ResponseMcpCallFailed as ResponseMcpCallFailed from .response_text_done_event import ResponseTextDoneEvent as ResponseTextDoneEvent +from .audio_transcription_param import AudioTranscriptionParam as AudioTranscriptionParam from .rate_limits_updated_event import RateLimitsUpdatedEvent as RateLimitsUpdatedEvent from .realtime_truncation_param import RealtimeTruncationParam as RealtimeTruncationParam from .response_audio_done_event import ResponseAudioDoneEvent as ResponseAudioDoneEvent @@ -43,6 +48,7 @@ from .response_audio_delta_event import ResponseAudioDeltaEvent as ResponseAudioDeltaEvent from .session_update_event_param import SessionUpdateEventParam as SessionUpdateEventParam from .client_secret_create_params 
import ClientSecretCreateParams as ClientSecretCreateParams +from .realtime_audio_config_input import RealtimeAudioConfigInput as RealtimeAudioConfigInput from .realtime_audio_config_param import RealtimeAudioConfigParam as RealtimeAudioConfigParam from .realtime_client_event_param import RealtimeClientEventParam as RealtimeClientEventParam from .realtime_mcp_protocol_error import RealtimeMcpProtocolError as RealtimeMcpProtocolError @@ -52,11 +58,12 @@ from .response_cancel_event_param import ResponseCancelEventParam as ResponseCancelEventParam from .response_create_event_param import ResponseCreateEventParam as ResponseCreateEventParam from .response_mcp_call_completed import ResponseMcpCallCompleted as ResponseMcpCallCompleted +from .realtime_audio_config_output import RealtimeAudioConfigOutput as RealtimeAudioConfigOutput +from .realtime_audio_formats_param import RealtimeAudioFormatsParam as RealtimeAudioFormatsParam from .realtime_mcp_tool_call_param import RealtimeMcpToolCallParam as RealtimeMcpToolCallParam from .realtime_mcphttp_error_param import RealtimeMcphttpErrorParam as RealtimeMcphttpErrorParam from .transcription_session_update import TranscriptionSessionUpdate as TranscriptionSessionUpdate from .client_secret_create_response import ClientSecretCreateResponse as ClientSecretCreateResponse -from .realtime_client_secret_config import RealtimeClientSecretConfig as RealtimeClientSecretConfig from .realtime_mcp_approval_request import RealtimeMcpApprovalRequest as RealtimeMcpApprovalRequest from .realtime_mcp_list_tools_param import RealtimeMcpListToolsParam as RealtimeMcpListToolsParam from .realtime_tracing_config_param import RealtimeTracingConfigParam as RealtimeTracingConfigParam @@ -66,11 +73,13 @@ from .conversation_item_delete_event import ConversationItemDeleteEvent as ConversationItemDeleteEvent from .input_audio_buffer_clear_event import InputAudioBufferClearEvent as InputAudioBufferClearEvent from .realtime_mcp_approval_response import RealtimeMcpApprovalResponse as RealtimeMcpApprovalResponse +from .realtime_session_client_secret import RealtimeSessionClientSecret as RealtimeSessionClientSecret from .conversation_item_created_event import ConversationItemCreatedEvent as ConversationItemCreatedEvent from .conversation_item_deleted_event import ConversationItemDeletedEvent as ConversationItemDeletedEvent from .input_audio_buffer_append_event import InputAudioBufferAppendEvent as InputAudioBufferAppendEvent from .input_audio_buffer_commit_event import InputAudioBufferCommitEvent as InputAudioBufferCommitEvent from .output_audio_buffer_clear_event import OutputAudioBufferClearEvent as OutputAudioBufferClearEvent +from .realtime_response_create_params import RealtimeResponseCreateParams as RealtimeResponseCreateParams from .realtime_session_create_request import RealtimeSessionCreateRequest as RealtimeSessionCreateRequest from .response_output_item_done_event import ResponseOutputItemDoneEvent as ResponseOutputItemDoneEvent from .conversation_item_retrieve_event import ConversationItemRetrieveEvent as ConversationItemRetrieveEvent @@ -81,26 +90,37 @@ from .response_mcp_call_arguments_done import ResponseMcpCallArgumentsDone as ResponseMcpCallArgumentsDone from .response_output_item_added_event import ResponseOutputItemAddedEvent as ResponseOutputItemAddedEvent from .conversation_item_truncated_event import ConversationItemTruncatedEvent as ConversationItemTruncatedEvent +from .realtime_audio_config_input_param import RealtimeAudioConfigInputParam as 
RealtimeAudioConfigInputParam from .realtime_mcp_protocol_error_param import RealtimeMcpProtocolErrorParam as RealtimeMcpProtocolErrorParam from .realtime_mcp_tool_execution_error import RealtimeMcpToolExecutionError as RealtimeMcpToolExecutionError +from .realtime_response_create_mcp_tool import RealtimeResponseCreateMcpTool as RealtimeResponseCreateMcpTool from .realtime_tool_choice_config_param import RealtimeToolChoiceConfigParam as RealtimeToolChoiceConfigParam from .realtime_tools_config_union_param import RealtimeToolsConfigUnionParam as RealtimeToolsConfigUnionParam from .response_content_part_added_event import ResponseContentPartAddedEvent as ResponseContentPartAddedEvent from .response_mcp_call_arguments_delta import ResponseMcpCallArgumentsDelta as ResponseMcpCallArgumentsDelta from .input_audio_buffer_committed_event import InputAudioBufferCommittedEvent as InputAudioBufferCommittedEvent +from .realtime_audio_config_output_param import RealtimeAudioConfigOutputParam as RealtimeAudioConfigOutputParam from .transcription_session_update_param import TranscriptionSessionUpdateParam as TranscriptionSessionUpdateParam -from .realtime_client_secret_config_param import RealtimeClientSecretConfigParam as RealtimeClientSecretConfigParam +from .realtime_audio_input_turn_detection import RealtimeAudioInputTurnDetection as RealtimeAudioInputTurnDetection from .realtime_mcp_approval_request_param import RealtimeMcpApprovalRequestParam as RealtimeMcpApprovalRequestParam +from .realtime_truncation_retention_ratio import RealtimeTruncationRetentionRatio as RealtimeTruncationRetentionRatio from .transcription_session_updated_event import TranscriptionSessionUpdatedEvent as TranscriptionSessionUpdatedEvent from .conversation_item_create_event_param import ConversationItemCreateEventParam as ConversationItemCreateEventParam from .conversation_item_delete_event_param import ConversationItemDeleteEventParam as ConversationItemDeleteEventParam from .input_audio_buffer_clear_event_param import InputAudioBufferClearEventParam as InputAudioBufferClearEventParam from .input_audio_buffer_timeout_triggered import InputAudioBufferTimeoutTriggered as InputAudioBufferTimeoutTriggered from .realtime_mcp_approval_response_param import RealtimeMcpApprovalResponseParam as RealtimeMcpApprovalResponseParam +from .realtime_transcription_session_audio import RealtimeTranscriptionSessionAudio as RealtimeTranscriptionSessionAudio from .response_audio_transcript_done_event import ResponseAudioTranscriptDoneEvent as ResponseAudioTranscriptDoneEvent from .input_audio_buffer_append_event_param import InputAudioBufferAppendEventParam as InputAudioBufferAppendEventParam from .input_audio_buffer_commit_event_param import InputAudioBufferCommitEventParam as InputAudioBufferCommitEventParam from .output_audio_buffer_clear_event_param import OutputAudioBufferClearEventParam as OutputAudioBufferClearEventParam +from .realtime_response_create_audio_output import ( + RealtimeResponseCreateAudioOutput as RealtimeResponseCreateAudioOutput, +) +from .realtime_response_create_params_param import ( + RealtimeResponseCreateParamsParam as RealtimeResponseCreateParamsParam, +) from .realtime_session_create_request_param import ( RealtimeSessionCreateRequestParam as RealtimeSessionCreateRequestParam, ) @@ -125,12 +145,30 @@ from .realtime_mcp_tool_execution_error_param import ( RealtimeMcpToolExecutionErrorParam as RealtimeMcpToolExecutionErrorParam, ) +from .realtime_response_create_mcp_tool_param import ( + 
RealtimeResponseCreateMcpToolParam as RealtimeResponseCreateMcpToolParam, +) from .realtime_conversation_item_function_call import ( RealtimeConversationItemFunctionCall as RealtimeConversationItemFunctionCall, ) +from .realtime_audio_input_turn_detection_param import ( + RealtimeAudioInputTurnDetectionParam as RealtimeAudioInputTurnDetectionParam, +) from .realtime_conversation_item_system_message import ( RealtimeConversationItemSystemMessage as RealtimeConversationItemSystemMessage, ) +from .realtime_truncation_retention_ratio_param import ( + RealtimeTruncationRetentionRatioParam as RealtimeTruncationRetentionRatioParam, +) +from .realtime_transcription_session_audio_input import ( + RealtimeTranscriptionSessionAudioInput as RealtimeTranscriptionSessionAudioInput, +) +from .realtime_transcription_session_audio_param import ( + RealtimeTranscriptionSessionAudioParam as RealtimeTranscriptionSessionAudioParam, +) +from .realtime_response_create_audio_output_param import ( + RealtimeResponseCreateAudioOutputParam as RealtimeResponseCreateAudioOutputParam, +) from .realtime_response_usage_input_token_details import ( RealtimeResponseUsageInputTokenDetails as RealtimeResponseUsageInputTokenDetails, ) @@ -143,6 +181,9 @@ from .realtime_response_usage_output_token_details import ( RealtimeResponseUsageOutputTokenDetails as RealtimeResponseUsageOutputTokenDetails, ) +from .realtime_transcription_session_client_secret import ( + RealtimeTranscriptionSessionClientSecret as RealtimeTranscriptionSessionClientSecret, +) from .response_function_call_arguments_delta_event import ( ResponseFunctionCallArgumentsDeltaEvent as ResponseFunctionCallArgumentsDeltaEvent, ) @@ -152,15 +193,24 @@ from .realtime_transcription_session_create_request import ( RealtimeTranscriptionSessionCreateRequest as RealtimeTranscriptionSessionCreateRequest, ) +from .realtime_transcription_session_turn_detection import ( + RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection, +) from .realtime_conversation_item_function_call_param import ( RealtimeConversationItemFunctionCallParam as RealtimeConversationItemFunctionCallParam, ) +from .realtime_transcription_session_create_response import ( + RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse, +) from .realtime_conversation_item_function_call_output import ( RealtimeConversationItemFunctionCallOutput as RealtimeConversationItemFunctionCallOutput, ) from .realtime_conversation_item_system_message_param import ( RealtimeConversationItemSystemMessageParam as RealtimeConversationItemSystemMessageParam, ) +from .realtime_transcription_session_audio_input_param import ( + RealtimeTranscriptionSessionAudioInputParam as RealtimeTranscriptionSessionAudioInputParam, +) from .realtime_conversation_item_assistant_message_param import ( RealtimeConversationItemAssistantMessageParam as RealtimeConversationItemAssistantMessageParam, ) @@ -179,6 +229,15 @@ from .conversation_item_input_audio_transcription_failed_event import ( ConversationItemInputAudioTranscriptionFailedEvent as ConversationItemInputAudioTranscriptionFailedEvent, ) +from .realtime_transcription_session_input_audio_transcription import ( + RealtimeTranscriptionSessionInputAudioTranscription as RealtimeTranscriptionSessionInputAudioTranscription, +) +from .realtime_transcription_session_audio_input_turn_detection import ( + RealtimeTranscriptionSessionAudioInputTurnDetection as RealtimeTranscriptionSessionAudioInputTurnDetection, +) from 
.conversation_item_input_audio_transcription_completed_event import ( ConversationItemInputAudioTranscriptionCompletedEvent as ConversationItemInputAudioTranscriptionCompletedEvent, ) +from .realtime_transcription_session_audio_input_turn_detection_param import ( + RealtimeTranscriptionSessionAudioInputTurnDetectionParam as RealtimeTranscriptionSessionAudioInputTurnDetectionParam, +) diff --git a/src/openai/types/realtime/audio_transcription.py b/src/openai/types/realtime/audio_transcription.py new file mode 100644 index 0000000000..cf662b3aa2 --- /dev/null +++ b/src/openai/types/realtime/audio_transcription.py @@ -0,0 +1,36 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["AudioTranscription"] + + +class AudioTranscription(BaseModel): + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + + model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = ( + None + ) + """The model to use for transcription. + + Current options are `whisper-1`, `gpt-4o-transcribe-latest`, + `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + """ + + prompt: Optional[str] = None + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". + """ diff --git a/src/openai/types/realtime/audio_transcription_param.py b/src/openai/types/realtime/audio_transcription_param.py new file mode 100644 index 0000000000..fb09f105b8 --- /dev/null +++ b/src/openai/types/realtime/audio_transcription_param.py @@ -0,0 +1,33 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, TypedDict + +__all__ = ["AudioTranscriptionParam"] + + +class AudioTranscriptionParam(TypedDict, total=False): + language: str + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + + model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"] + """The model to use for transcription. + + Current options are `whisper-1`, `gpt-4o-transcribe-latest`, + `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + """ + + prompt: str + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". 
+ """ diff --git a/src/openai/types/realtime/client_secret_create_params.py b/src/openai/types/realtime/client_secret_create_params.py index 696176e5a8..5f0b0d796f 100644 --- a/src/openai/types/realtime/client_secret_create_params.py +++ b/src/openai/types/realtime/client_secret_create_params.py @@ -13,7 +13,12 @@ class ClientSecretCreateParams(TypedDict, total=False): expires_after: ExpiresAfter - """Configuration for the ephemeral token expiration.""" + """Configuration for the client secret expiration. + + Expiration refers to the time after which a client secret will no longer be + valid for creating sessions. The session itself may continue after that time + once started. A secret can be used to create multiple sessions until it expires. + """ session: Session """Session configuration to use for the client secret. @@ -24,15 +29,17 @@ class ClientSecretCreateParams(TypedDict, total=False): class ExpiresAfter(TypedDict, total=False): anchor: Literal["created_at"] - """The anchor point for the ephemeral token expiration. - - Only `created_at` is currently supported. + """ + The anchor point for the client secret expiration, meaning that `seconds` will + be added to the `created_at` time of the client secret to produce an expiration + timestamp. Only `created_at` is currently supported. """ seconds: int """The number of seconds from the anchor point to the expiration. - Select a value between `10` and `7200`. + Select a value between `10` and `7200` (2 hours). This default to 600 seconds + (10 minutes) if not specified. """ diff --git a/src/openai/types/realtime/client_secret_create_response.py b/src/openai/types/realtime/client_secret_create_response.py index ea8b9f9ca1..8d61be3ab7 100644 --- a/src/openai/types/realtime/client_secret_create_response.py +++ b/src/openai/types/realtime/client_secret_create_response.py @@ -1,102 +1,15 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional -from typing_extensions import Literal, TypeAlias +from typing import Union +from typing_extensions import TypeAlias from ..._models import BaseModel from .realtime_session_create_response import RealtimeSessionCreateResponse +from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse -__all__ = [ - "ClientSecretCreateResponse", - "Session", - "SessionRealtimeTranscriptionSessionCreateResponse", - "SessionRealtimeTranscriptionSessionCreateResponseAudio", - "SessionRealtimeTranscriptionSessionCreateResponseAudioInput", - "SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction", - "SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription", - "SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection", -] +__all__ = ["ClientSecretCreateResponse", "Session"] - -class SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - - -class SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None - """The model to use for transcription. 
- - Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. - """ - - prompt: Optional[str] = None - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. - """ - - -class SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection(BaseModel): - prefix_padding_ms: Optional[int] = None - - silence_duration_ms: Optional[int] = None - - threshold: Optional[float] = None - - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" - - -class SessionRealtimeTranscriptionSessionCreateResponseAudioInput(BaseModel): - format: Optional[str] = None - """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" - - noise_reduction: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputNoiseReduction] = None - """Configuration for input audio noise reduction.""" - - transcription: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputTranscription] = None - """Configuration of the transcription model.""" - - turn_detection: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInputTurnDetection] = None - """Configuration for turn detection.""" - - -class SessionRealtimeTranscriptionSessionCreateResponseAudio(BaseModel): - input: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudioInput] = None - - -class SessionRealtimeTranscriptionSessionCreateResponse(BaseModel): - id: Optional[str] = None - """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" - - audio: Optional[SessionRealtimeTranscriptionSessionCreateResponseAudio] = None - """Configuration for input audio for the session.""" - - expires_at: Optional[int] = None - """Expiration timestamp for the session, in seconds since epoch.""" - - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None - """Additional fields to include in server outputs. - - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. - """ - - object: Optional[str] = None - """The object type. 
Always `realtime.transcription_session`.""" - - -Session: TypeAlias = Union[RealtimeSessionCreateResponse, SessionRealtimeTranscriptionSessionCreateResponse] +Session: TypeAlias = Union[RealtimeSessionCreateResponse, RealtimeTranscriptionSessionCreateResponse] class ClientSecretCreateResponse(BaseModel): diff --git a/src/openai/types/realtime/conversation_item_input_audio_transcription_completed_event.py b/src/openai/types/realtime/conversation_item_input_audio_transcription_completed_event.py index eda3f3bab6..09b20aa184 100644 --- a/src/openai/types/realtime/conversation_item_input_audio_transcription_completed_event.py +++ b/src/openai/types/realtime/conversation_item_input_audio_transcription_completed_event.py @@ -59,7 +59,7 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel): """The unique ID of the server event.""" item_id: str - """The ID of the user message item containing the audio.""" + """The ID of the item containing the audio that is being transcribed.""" transcript: str """The transcribed text.""" @@ -70,7 +70,10 @@ class ConversationItemInputAudioTranscriptionCompletedEvent(BaseModel): """ usage: Usage - """Usage statistics for the transcription.""" + """ + Usage statistics for the transcription, this is billed according to the ASR + model's pricing rather than the realtime model's pricing. + """ logprobs: Optional[List[LogProbProperties]] = None """The log probabilities of the transcription.""" diff --git a/src/openai/types/realtime/conversation_item_input_audio_transcription_delta_event.py b/src/openai/types/realtime/conversation_item_input_audio_transcription_delta_event.py index 4e9528ccb0..f49e6f636f 100644 --- a/src/openai/types/realtime/conversation_item_input_audio_transcription_delta_event.py +++ b/src/openai/types/realtime/conversation_item_input_audio_transcription_delta_event.py @@ -14,7 +14,7 @@ class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel): """The unique ID of the server event.""" item_id: str - """The ID of the item.""" + """The ID of the item containing the audio that is being transcribed.""" type: Literal["conversation.item.input_audio_transcription.delta"] """The event type, must be `conversation.item.input_audio_transcription.delta`.""" @@ -26,4 +26,11 @@ class ConversationItemInputAudioTranscriptionDeltaEvent(BaseModel): """The text delta.""" logprobs: Optional[List[LogProbProperties]] = None - """The log probabilities of the transcription.""" + """The log probabilities of the transcription. + + These can be enabled by configurating the session with + `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the + array corresponds a log probability of which token would be selected for this + chunk of transcription. This can help to identify if it was possible there were + multiple valid options for a given chunk of transcription. + """ diff --git a/src/openai/types/realtime/conversation_item_truncate_event.py b/src/openai/types/realtime/conversation_item_truncate_event.py index 63b591bfdb..d6c6779cc8 100644 --- a/src/openai/types/realtime/conversation_item_truncate_event.py +++ b/src/openai/types/realtime/conversation_item_truncate_event.py @@ -17,7 +17,7 @@ class ConversationItemTruncateEvent(BaseModel): """ content_index: int - """The index of the content part to truncate. Set this to 0.""" + """The index of the content part to truncate. Set this to `0`.""" item_id: str """The ID of the assistant message item to truncate. 
diff --git a/src/openai/types/realtime/conversation_item_truncate_event_param.py b/src/openai/types/realtime/conversation_item_truncate_event_param.py index d3ad1e1e25..f5ab13a419 100644 --- a/src/openai/types/realtime/conversation_item_truncate_event_param.py +++ b/src/openai/types/realtime/conversation_item_truncate_event_param.py @@ -16,7 +16,7 @@ class ConversationItemTruncateEventParam(TypedDict, total=False): """ content_index: Required[int] - """The index of the content part to truncate. Set this to 0.""" + """The index of the content part to truncate. Set this to `0`.""" item_id: Required[str] """The ID of the assistant message item to truncate. diff --git a/src/openai/types/realtime/models.py b/src/openai/types/realtime/models.py new file mode 100644 index 0000000000..d4827538a3 --- /dev/null +++ b/src/openai/types/realtime/models.py @@ -0,0 +1,25 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["Models"] + + +class Models(BaseModel): + description: Optional[str] = None + """ + The description of the function, including guidance on when and how to call it, + and guidance about what to tell the user when calling (if anything). + """ + + name: Optional[str] = None + """The name of the function.""" + + parameters: Optional[object] = None + """Parameters of the function in JSON Schema.""" + + type: Optional[Literal["function"]] = None + """The type of the tool, i.e. `function`.""" diff --git a/src/openai/types/realtime/models_param.py b/src/openai/types/realtime/models_param.py new file mode 100644 index 0000000000..1db2d7e464 --- /dev/null +++ b/src/openai/types/realtime/models_param.py @@ -0,0 +1,24 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, TypedDict + +__all__ = ["ModelsParam"] + + +class ModelsParam(TypedDict, total=False): + description: str + """ + The description of the function, including guidance on when and how to call it, + and guidance about what to tell the user when calling (if anything). + """ + + name: str + """The name of the function.""" + + parameters: object + """Parameters of the function in JSON Schema.""" + + type: Literal["function"] + """The type of the tool, i.e. `function`.""" diff --git a/src/openai/types/realtime/noise_reduction_type.py b/src/openai/types/realtime/noise_reduction_type.py new file mode 100644 index 0000000000..f4338991bb --- /dev/null +++ b/src/openai/types/realtime/noise_reduction_type.py @@ -0,0 +1,7 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal, TypeAlias + +__all__ = ["NoiseReductionType"] + +NoiseReductionType: TypeAlias = Literal["near_field", "far_field"] diff --git a/src/openai/types/realtime/realtime_audio_config.py b/src/openai/types/realtime/realtime_audio_config.py index 7463c70038..72d7cc59cc 100644 --- a/src/openai/types/realtime/realtime_audio_config.py +++ b/src/openai/types/realtime/realtime_audio_config.py @@ -1,184 +1,15 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import Union, Optional -from typing_extensions import Literal +from typing import Optional from ..._models import BaseModel +from .realtime_audio_config_input import RealtimeAudioConfigInput +from .realtime_audio_config_output import RealtimeAudioConfigOutput -__all__ = ["RealtimeAudioConfig", "Input", "InputNoiseReduction", "InputTranscription", "InputTurnDetection", "Output"] - - -class InputNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - """Type of noise reduction. - - `near_field` is for close-talking microphones such as headphones, `far_field` is - for far-field microphones such as laptop or conference room microphones. - """ - - -class InputTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[ - Literal[ - "whisper-1", - "gpt-4o-transcribe-latest", - "gpt-4o-mini-transcribe", - "gpt-4o-transcribe", - "gpt-4o-transcribe-diarize", - ] - ] = None - """The model to use for transcription. - - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. - """ - - prompt: Optional[str] = None - """ - An optional text to guide the model's style or continue a previous audio - segment. For `whisper-1`, the - [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". - """ - - -class InputTurnDetection(BaseModel): - create_response: Optional[bool] = None - """ - Whether or not to automatically generate a response when a VAD stop event - occurs. - """ - - eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None - """Used only for `semantic_vad` mode. - - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. - """ - - idle_timeout_ms: Optional[int] = None - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received. - """ - - interrupt_response: Optional[bool] = None - """ - Whether or not to automatically interrupt any ongoing response with output to - the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. - """ - - prefix_padding_ms: Optional[int] = None - """Used only for `server_vad` mode. - - Amount of audio to include before the VAD detected speech (in milliseconds). - Defaults to 300ms. - """ - - silence_duration_ms: Optional[int] = None - """Used only for `server_vad` mode. - - Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. - With shorter values the model will respond more quickly, but may jump in on - short pauses from the user. - """ - - threshold: Optional[float] = None - """Used only for `server_vad` mode. - - Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher - threshold will require louder audio to activate the model, and thus might - perform better in noisy environments. 
- """ - - type: Optional[Literal["server_vad", "semantic_vad"]] = None - """Type of turn detection.""" - - -class Input(BaseModel): - format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of input audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must - be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian - byte order. - """ - - noise_reduction: Optional[InputNoiseReduction] = None - """Configuration for input audio noise reduction. - - This can be set to `null` to turn off. Noise reduction filters audio added to - the input audio buffer before it is sent to VAD and the model. Filtering the - audio can improve VAD and turn detection accuracy (reducing false positives) and - model performance by improving perception of the input audio. - """ - - transcription: Optional[InputTranscription] = None - """ - Configuration for input audio transcription, defaults to off and can be set to - `null` to turn off once on. Input audio transcription is not native to the - model, since the model consumes audio directly. Transcription runs - asynchronously through - [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - and should be treated as guidance of input audio content rather than precisely - what the model heard. The client can optionally set the language and prompt for - transcription, these offer additional guidance to the transcription service. - """ - - turn_detection: Optional[InputTurnDetection] = None - """Configuration for turn detection, ether Server VAD or Semantic VAD. - - This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. - Semantic VAD is more advanced and uses a turn detection model (in conjunction - with VAD) to semantically estimate whether the user has finished speaking, then - dynamically sets a timeout based on this probability. For example, if user audio - trails off with "uhhm", the model will score a low probability of turn end and - wait longer for the user to continue speaking. This can be useful for more - natural conversations, but may have a higher latency. - """ - - -class Output(BaseModel): - format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of output audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is - sampled at a rate of 24kHz. - """ - - speed: Optional[float] = None - """The speed of the model's spoken response. - - 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. - This value can only be changed in between model turns, not while a response is - in progress. - """ - - voice: Union[ - str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None - ] = None - """The voice the model uses to respond. - - Voice cannot be changed during the session once the model has responded with - audio at least once. Current voice options are `alloy`, `ash`, `ballad`, - `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. 
- """ +__all__ = ["RealtimeAudioConfig"] class RealtimeAudioConfig(BaseModel): - input: Optional[Input] = None + input: Optional[RealtimeAudioConfigInput] = None - output: Optional[Output] = None + output: Optional[RealtimeAudioConfigOutput] = None diff --git a/src/openai/types/realtime/realtime_audio_config_input.py b/src/openai/types/realtime/realtime_audio_config_input.py new file mode 100644 index 0000000000..fd96e2a52d --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_config_input.py @@ -0,0 +1,60 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel +from .audio_transcription import AudioTranscription +from .noise_reduction_type import NoiseReductionType +from .realtime_audio_formats import RealtimeAudioFormats +from .realtime_audio_input_turn_detection import RealtimeAudioInputTurnDetection + +__all__ = ["RealtimeAudioConfigInput", "NoiseReduction"] + + +class NoiseReduction(BaseModel): + type: Optional[NoiseReductionType] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class RealtimeAudioConfigInput(BaseModel): + format: Optional[RealtimeAudioFormats] = None + """The format of the input audio.""" + + noise_reduction: Optional[NoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + transcription: Optional[AudioTranscription] = None + """ + Configuration for input audio transcription, defaults to off and can be set to + `null` to turn off once on. Input audio transcription is not native to the + model, since the model consumes audio directly. Transcription runs + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. + """ + + turn_detection: Optional[RealtimeAudioInputTurnDetection] = None + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. 
+ """ diff --git a/src/openai/types/realtime/realtime_audio_config_input_param.py b/src/openai/types/realtime/realtime_audio_config_input_param.py new file mode 100644 index 0000000000..1dfb439006 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_config_input_param.py @@ -0,0 +1,61 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import TypedDict + +from .noise_reduction_type import NoiseReductionType +from .audio_transcription_param import AudioTranscriptionParam +from .realtime_audio_formats_param import RealtimeAudioFormatsParam +from .realtime_audio_input_turn_detection_param import RealtimeAudioInputTurnDetectionParam + +__all__ = ["RealtimeAudioConfigInputParam", "NoiseReduction"] + + +class NoiseReduction(TypedDict, total=False): + type: NoiseReductionType + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class RealtimeAudioConfigInputParam(TypedDict, total=False): + format: RealtimeAudioFormatsParam + """The format of the input audio.""" + + noise_reduction: NoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + transcription: AudioTranscriptionParam + """ + Configuration for input audio transcription, defaults to off and can be set to + `null` to turn off once on. Input audio transcription is not native to the + model, since the model consumes audio directly. Transcription runs + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. + """ + + turn_detection: RealtimeAudioInputTurnDetectionParam + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. + """ diff --git a/src/openai/types/realtime/realtime_audio_config_output.py b/src/openai/types/realtime/realtime_audio_config_output.py new file mode 100644 index 0000000000..a8af237c1d --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_config_output.py @@ -0,0 +1,36 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import Union, Optional +from typing_extensions import Literal + +from ..._models import BaseModel +from .realtime_audio_formats import RealtimeAudioFormats + +__all__ = ["RealtimeAudioConfigOutput"] + + +class RealtimeAudioConfigOutput(BaseModel): + format: Optional[RealtimeAudioFormats] = None + """The format of the output audio.""" + + speed: Optional[float] = None + """ + The speed of the model's spoken response as a multiple of the original speed. + 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + This value can only be changed in between model turns, not while a response is + in progress. + + This parameter is a post-processing adjustment to the audio after it is + generated, it's also possible to prompt the model to speak faster or slower. + """ + + voice: Union[ + str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None + ] = None + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. + """ diff --git a/src/openai/types/realtime/realtime_audio_config_output_param.py b/src/openai/types/realtime/realtime_audio_config_output_param.py new file mode 100644 index 0000000000..8e887d3464 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_config_output_param.py @@ -0,0 +1,35 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Union +from typing_extensions import Literal, TypedDict + +from .realtime_audio_formats_param import RealtimeAudioFormatsParam + +__all__ = ["RealtimeAudioConfigOutputParam"] + + +class RealtimeAudioConfigOutputParam(TypedDict, total=False): + format: RealtimeAudioFormatsParam + """The format of the output audio.""" + + speed: float + """ + The speed of the model's spoken response as a multiple of the original speed. + 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + This value can only be changed in between model turns, not while a response is + in progress. + + This parameter is a post-processing adjustment to the audio after it is + generated, it's also possible to prompt the model to speak faster or slower. + """ + + voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]] + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. 
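A minimal usage sketch, assuming the new nested audio shape shown above: the session-level audio configuration can be written as plain dicts matching `RealtimeAudioConfigInputParam` and `RealtimeAudioConfigOutputParam` (the concrete values such as 24 kHz PCM, `near_field`, `semantic_vad`, and `marin` are illustrative picks taken from docstrings in this diff, not recommendations).

```python
# Hedged sketch only: plain dicts shaped like the new *Param TypedDicts.
# Every value below is an illustrative assumption drawn from docstrings in
# this diff, not a required or recommended setting.
audio_config = {
    "input": {
        "format": {"type": "audio/pcm", "rate": 24000},
        "noise_reduction": {"type": "near_field"},
        "turn_detection": {"type": "semantic_vad", "eagerness": "auto"},
    },
    "output": {
        "format": {"type": "audio/pcm", "rate": 24000},
        "voice": "marin",
        "speed": 1.0,
    },
}
```

These dicts line up with the `input` and `output` keys of `RealtimeAudioConfigParam`, whose simplified definition follows below.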
+ """ diff --git a/src/openai/types/realtime/realtime_audio_config_param.py b/src/openai/types/realtime/realtime_audio_config_param.py index 9f2e12e910..2c41de35ae 100644 --- a/src/openai/types/realtime/realtime_audio_config_param.py +++ b/src/openai/types/realtime/realtime_audio_config_param.py @@ -2,186 +2,15 @@ from __future__ import annotations -from typing import Union, Optional -from typing_extensions import Literal, TypedDict +from typing_extensions import TypedDict -__all__ = [ - "RealtimeAudioConfigParam", - "Input", - "InputNoiseReduction", - "InputTranscription", - "InputTurnDetection", - "Output", -] +from .realtime_audio_config_input_param import RealtimeAudioConfigInputParam +from .realtime_audio_config_output_param import RealtimeAudioConfigOutputParam - -class InputNoiseReduction(TypedDict, total=False): - type: Literal["near_field", "far_field"] - """Type of noise reduction. - - `near_field` is for close-talking microphones such as headphones, `far_field` is - for far-field microphones such as laptop or conference room microphones. - """ - - -class InputTranscription(TypedDict, total=False): - language: str - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Literal[ - "whisper-1", - "gpt-4o-transcribe-latest", - "gpt-4o-mini-transcribe", - "gpt-4o-transcribe", - "gpt-4o-transcribe-diarize", - ] - """The model to use for transcription. - - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. - """ - - prompt: str - """ - An optional text to guide the model's style or continue a previous audio - segment. For `whisper-1`, the - [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". - """ - - -class InputTurnDetection(TypedDict, total=False): - create_response: bool - """ - Whether or not to automatically generate a response when a VAD stop event - occurs. - """ - - eagerness: Literal["low", "medium", "high", "auto"] - """Used only for `semantic_vad` mode. - - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. - """ - - idle_timeout_ms: Optional[int] - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received. - """ - - interrupt_response: bool - """ - Whether or not to automatically interrupt any ongoing response with output to - the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. - """ - - prefix_padding_ms: int - """Used only for `server_vad` mode. - - Amount of audio to include before the VAD detected speech (in milliseconds). - Defaults to 300ms. - """ - - silence_duration_ms: int - """Used only for `server_vad` mode. - - Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. - With shorter values the model will respond more quickly, but may jump in on - short pauses from the user. - """ - - threshold: float - """Used only for `server_vad` mode. - - Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. 
A higher - threshold will require louder audio to activate the model, and thus might - perform better in noisy environments. - """ - - type: Literal["server_vad", "semantic_vad"] - """Type of turn detection.""" - - -class Input(TypedDict, total=False): - format: Literal["pcm16", "g711_ulaw", "g711_alaw"] - """The format of input audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must - be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian - byte order. - """ - - noise_reduction: InputNoiseReduction - """Configuration for input audio noise reduction. - - This can be set to `null` to turn off. Noise reduction filters audio added to - the input audio buffer before it is sent to VAD and the model. Filtering the - audio can improve VAD and turn detection accuracy (reducing false positives) and - model performance by improving perception of the input audio. - """ - - transcription: InputTranscription - """ - Configuration for input audio transcription, defaults to off and can be set to - `null` to turn off once on. Input audio transcription is not native to the - model, since the model consumes audio directly. Transcription runs - asynchronously through - [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - and should be treated as guidance of input audio content rather than precisely - what the model heard. The client can optionally set the language and prompt for - transcription, these offer additional guidance to the transcription service. - """ - - turn_detection: InputTurnDetection - """Configuration for turn detection, ether Server VAD or Semantic VAD. - - This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. - Semantic VAD is more advanced and uses a turn detection model (in conjunction - with VAD) to semantically estimate whether the user has finished speaking, then - dynamically sets a timeout based on this probability. For example, if user audio - trails off with "uhhm", the model will score a low probability of turn end and - wait longer for the user to continue speaking. This can be useful for more - natural conversations, but may have a higher latency. - """ - - -class Output(TypedDict, total=False): - format: Literal["pcm16", "g711_ulaw", "g711_alaw"] - """The format of output audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is - sampled at a rate of 24kHz. - """ - - speed: float - """The speed of the model's spoken response. - - 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. - This value can only be changed in between model turns, not while a response is - in progress. - """ - - voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]] - """The voice the model uses to respond. - - Voice cannot be changed during the session once the model has responded with - audio at least once. Current voice options are `alloy`, `ash`, `ballad`, - `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. 
- """ +__all__ = ["RealtimeAudioConfigParam"] class RealtimeAudioConfigParam(TypedDict, total=False): - input: Input + input: RealtimeAudioConfigInputParam - output: Output + output: RealtimeAudioConfigOutputParam diff --git a/src/openai/types/realtime/realtime_audio_formats.py b/src/openai/types/realtime/realtime_audio_formats.py new file mode 100644 index 0000000000..10f91883b6 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_formats.py @@ -0,0 +1,30 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from ..._utils import PropertyInfo +from ..._models import BaseModel + +__all__ = ["RealtimeAudioFormats", "AudioPCM", "AudioPCMU", "AudioPCMA"] + + +class AudioPCM(BaseModel): + rate: Optional[Literal[24000]] = None + """The sample rate of the audio. Always `24000`.""" + + type: Optional[Literal["audio/pcm"]] = None + """The audio format. Always `audio/pcm`.""" + + +class AudioPCMU(BaseModel): + type: Optional[Literal["audio/pcmu"]] = None + """The audio format. Always `audio/pcmu`.""" + + +class AudioPCMA(BaseModel): + type: Optional[Literal["audio/pcma"]] = None + """The audio format. Always `audio/pcma`.""" + + +RealtimeAudioFormats: TypeAlias = Annotated[Union[AudioPCM, AudioPCMU, AudioPCMA], PropertyInfo(discriminator="type")] diff --git a/src/openai/types/realtime/realtime_audio_formats_param.py b/src/openai/types/realtime/realtime_audio_formats_param.py new file mode 100644 index 0000000000..cf58577f38 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_formats_param.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Union +from typing_extensions import Literal, TypeAlias, TypedDict + +__all__ = ["RealtimeAudioFormatsParam", "AudioPCM", "AudioPCMU", "AudioPCMA"] + + +class AudioPCM(TypedDict, total=False): + rate: Literal[24000] + """The sample rate of the audio. Always `24000`.""" + + type: Literal["audio/pcm"] + """The audio format. Always `audio/pcm`.""" + + +class AudioPCMU(TypedDict, total=False): + type: Literal["audio/pcmu"] + """The audio format. Always `audio/pcmu`.""" + + +class AudioPCMA(TypedDict, total=False): + type: Literal["audio/pcma"] + """The audio format. Always `audio/pcma`.""" + + +RealtimeAudioFormatsParam: TypeAlias = Union[AudioPCM, AudioPCMU, AudioPCMA] diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection.py b/src/openai/types/realtime/realtime_audio_input_turn_detection.py new file mode 100644 index 0000000000..ea9423f6a1 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_input_turn_detection.py @@ -0,0 +1,64 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["RealtimeAudioInputTurnDetection"] + + +class RealtimeAudioInputTurnDetection(BaseModel): + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. 
`low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + idle_timeout_ms: Optional[int] = None + """ + Optional idle timeout after which turn detection will auto-timeout when no + additional audio is received. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: Optional[float] = None + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py b/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py new file mode 100644 index 0000000000..ec398f52e6 --- /dev/null +++ b/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py @@ -0,0 +1,64 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, TypedDict + +__all__ = ["RealtimeAudioInputTurnDetectionParam"] + + +class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False): + create_response: bool + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + idle_timeout_ms: Optional[int] + """ + Optional idle timeout after which turn detection will auto-timeout when no + additional audio is received. + """ + + interrupt_response: bool + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: int + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: int + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: float + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. 
+ """ + + type: Literal["server_vad", "semantic_vad"] + """Type of turn detection.""" diff --git a/src/openai/types/realtime/realtime_client_secret_config.py b/src/openai/types/realtime/realtime_client_secret_config.py deleted file mode 100644 index 29f8f57081..0000000000 --- a/src/openai/types/realtime/realtime_client_secret_config.py +++ /dev/null @@ -1,27 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from ..._models import BaseModel - -__all__ = ["RealtimeClientSecretConfig", "ExpiresAfter"] - - -class ExpiresAfter(BaseModel): - anchor: Literal["created_at"] - """The anchor point for the ephemeral token expiration. - - Only `created_at` is currently supported. - """ - - seconds: Optional[int] = None - """The number of seconds from the anchor point to the expiration. - - Select a value between `10` and `7200`. - """ - - -class RealtimeClientSecretConfig(BaseModel): - expires_after: Optional[ExpiresAfter] = None - """Configuration for the ephemeral token expiration.""" diff --git a/src/openai/types/realtime/realtime_client_secret_config_param.py b/src/openai/types/realtime/realtime_client_secret_config_param.py deleted file mode 100644 index 30a80134ee..0000000000 --- a/src/openai/types/realtime/realtime_client_secret_config_param.py +++ /dev/null @@ -1,26 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Literal, Required, TypedDict - -__all__ = ["RealtimeClientSecretConfigParam", "ExpiresAfter"] - - -class ExpiresAfter(TypedDict, total=False): - anchor: Required[Literal["created_at"]] - """The anchor point for the ephemeral token expiration. - - Only `created_at` is currently supported. - """ - - seconds: int - """The number of seconds from the anchor point to the expiration. - - Select a value between `10` and `7200`. - """ - - -class RealtimeClientSecretConfigParam(TypedDict, total=False): - expires_after: ExpiresAfter - """Configuration for the ephemeral token expiration.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_assistant_message.py b/src/openai/types/realtime/realtime_conversation_item_assistant_message.py index d0f37745ea..6b0f86ee32 100644 --- a/src/openai/types/realtime/realtime_conversation_item_assistant_message.py +++ b/src/openai/types/realtime/realtime_conversation_item_assistant_message.py @@ -9,11 +9,27 @@ class Content(BaseModel): + audio: Optional[str] = None + """ + Base64-encoded audio bytes, these will be parsed as the format specified in the + session output audio type configuration. This defaults to PCM 16-bit 24kHz mono + if not specified. + """ + text: Optional[str] = None """The text content.""" - type: Optional[Literal["text"]] = None - """The content type. Always `text` for assistant messages.""" + transcript: Optional[str] = None + """ + The transcript of the audio content, this will always be present if the output + type is `audio`. + """ + + type: Optional[Literal["output_text", "output_audio"]] = None + """ + The content type, `output_text` or `output_audio` depending on the session + `output_modalities` configuration. + """ class RealtimeConversationItemAssistantMessage(BaseModel): @@ -27,10 +43,16 @@ class RealtimeConversationItemAssistantMessage(BaseModel): """The type of the item. Always `message`.""" id: Optional[str] = None - """The unique ID of the item.""" + """The unique ID of the item. 
+ + This may be provided by the client or generated by the server. + """ object: Optional[Literal["realtime.item"]] = None - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Optional[Literal["completed", "incomplete", "in_progress"]] = None """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_assistant_message_param.py b/src/openai/types/realtime/realtime_conversation_item_assistant_message_param.py index cfbd9cd2cf..93699afba2 100644 --- a/src/openai/types/realtime/realtime_conversation_item_assistant_message_param.py +++ b/src/openai/types/realtime/realtime_conversation_item_assistant_message_param.py @@ -9,11 +9,27 @@ class Content(TypedDict, total=False): + audio: str + """ + Base64-encoded audio bytes, these will be parsed as the format specified in the + session output audio type configuration. This defaults to PCM 16-bit 24kHz mono + if not specified. + """ + text: str """The text content.""" - type: Literal["text"] - """The content type. Always `text` for assistant messages.""" + transcript: str + """ + The transcript of the audio content, this will always be present if the output + type is `audio`. + """ + + type: Literal["output_text", "output_audio"] + """ + The content type, `output_text` or `output_audio` depending on the session + `output_modalities` configuration. + """ class RealtimeConversationItemAssistantMessageParam(TypedDict, total=False): @@ -27,10 +43,16 @@ class RealtimeConversationItemAssistantMessageParam(TypedDict, total=False): """The type of the item. Always `message`.""" id: str - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Literal["realtime.item"] - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Literal["completed", "incomplete", "in_progress"] """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_function_call.py b/src/openai/types/realtime/realtime_conversation_item_function_call.py index ce1c6d4cb2..279a2fcdc5 100644 --- a/src/openai/types/realtime/realtime_conversation_item_function_call.py +++ b/src/openai/types/realtime/realtime_conversation_item_function_call.py @@ -10,7 +10,11 @@ class RealtimeConversationItemFunctionCall(BaseModel): arguments: str - """The arguments of the function call.""" + """The arguments of the function call. + + This is a JSON-encoded string representing the arguments passed to the function, + for example `{"arg1": "value1", "arg2": 42}`. + """ name: str """The name of the function being called.""" @@ -19,13 +23,19 @@ class RealtimeConversationItemFunctionCall(BaseModel): """The type of the item. Always `function_call`.""" id: Optional[str] = None - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ call_id: Optional[str] = None """The ID of the function call.""" object: Optional[Literal["realtime.item"]] = None - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. 
+ + Optional when creating a new item. + """ status: Optional[Literal["completed", "incomplete", "in_progress"]] = None """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_function_call_output.py b/src/openai/types/realtime/realtime_conversation_item_function_call_output.py index cea840fdba..4b6b15d0ad 100644 --- a/src/openai/types/realtime/realtime_conversation_item_function_call_output.py +++ b/src/openai/types/realtime/realtime_conversation_item_function_call_output.py @@ -13,16 +13,25 @@ class RealtimeConversationItemFunctionCallOutput(BaseModel): """The ID of the function call this output is for.""" output: str - """The output of the function call.""" + """ + The output of the function call, this is free text and can contain any + information or simply be empty. + """ type: Literal["function_call_output"] """The type of the item. Always `function_call_output`.""" id: Optional[str] = None - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Optional[Literal["realtime.item"]] = None - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Optional[Literal["completed", "incomplete", "in_progress"]] = None """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_function_call_output_param.py b/src/openai/types/realtime/realtime_conversation_item_function_call_output_param.py index a66c587fb6..56d62da563 100644 --- a/src/openai/types/realtime/realtime_conversation_item_function_call_output_param.py +++ b/src/openai/types/realtime/realtime_conversation_item_function_call_output_param.py @@ -12,16 +12,25 @@ class RealtimeConversationItemFunctionCallOutputParam(TypedDict, total=False): """The ID of the function call this output is for.""" output: Required[str] - """The output of the function call.""" + """ + The output of the function call, this is free text and can contain any + information or simply be empty. + """ type: Required[Literal["function_call_output"]] """The type of the item. Always `function_call_output`.""" id: str - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Literal["realtime.item"] - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Literal["completed", "incomplete", "in_progress"] """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_function_call_param.py b/src/openai/types/realtime/realtime_conversation_item_function_call_param.py index a4d6fb83ab..36a16a27b3 100644 --- a/src/openai/types/realtime/realtime_conversation_item_function_call_param.py +++ b/src/openai/types/realtime/realtime_conversation_item_function_call_param.py @@ -9,7 +9,11 @@ class RealtimeConversationItemFunctionCallParam(TypedDict, total=False): arguments: Required[str] - """The arguments of the function call.""" + """The arguments of the function call. 
+ + This is a JSON-encoded string representing the arguments passed to the function, + for example `{"arg1": "value1", "arg2": 42}`. + """ name: Required[str] """The name of the function being called.""" @@ -18,13 +22,19 @@ class RealtimeConversationItemFunctionCallParam(TypedDict, total=False): """The type of the item. Always `function_call`.""" id: str - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ call_id: str """The ID of the function call.""" object: Literal["realtime.item"] - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Literal["completed", "incomplete", "in_progress"] """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_system_message.py b/src/openai/types/realtime/realtime_conversation_item_system_message.py index abc67f6c5f..7dac5c9fe2 100644 --- a/src/openai/types/realtime/realtime_conversation_item_system_message.py +++ b/src/openai/types/realtime/realtime_conversation_item_system_message.py @@ -27,10 +27,16 @@ class RealtimeConversationItemSystemMessage(BaseModel): """The type of the item. Always `message`.""" id: Optional[str] = None - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Optional[Literal["realtime.item"]] = None - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Optional[Literal["completed", "incomplete", "in_progress"]] = None """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_system_message_param.py b/src/openai/types/realtime/realtime_conversation_item_system_message_param.py index 2a1c442738..a2790fcf67 100644 --- a/src/openai/types/realtime/realtime_conversation_item_system_message_param.py +++ b/src/openai/types/realtime/realtime_conversation_item_system_message_param.py @@ -27,10 +27,16 @@ class RealtimeConversationItemSystemMessageParam(TypedDict, total=False): """The type of the item. Always `message`.""" id: str - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Literal["realtime.item"] - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Literal["completed", "incomplete", "in_progress"] """The status of the item. 
Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_user_message.py b/src/openai/types/realtime/realtime_conversation_item_user_message.py index 48a6c6ec0a..30d9bb10e3 100644 --- a/src/openai/types/realtime/realtime_conversation_item_user_message.py +++ b/src/openai/types/realtime/realtime_conversation_item_user_message.py @@ -10,16 +10,37 @@ class Content(BaseModel): audio: Optional[str] = None - """Base64-encoded audio bytes (for `input_audio`).""" + """ + Base64-encoded audio bytes (for `input_audio`), these will be parsed as the + format specified in the session input audio type configuration. This defaults to + PCM 16-bit 24kHz mono if not specified. + """ + + detail: Optional[Literal["auto", "low", "high"]] = None + """The detail level of the image (for `input_image`). + + `auto` will default to `high`. + """ + + image_url: Optional[str] = None + """Base64-encoded image bytes (for `input_image`) as a data URI. + + For example `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported + formats are PNG and JPEG. + """ text: Optional[str] = None """The text content (for `input_text`).""" transcript: Optional[str] = None - """Transcript of the audio (for `input_audio`).""" + """Transcript of the audio (for `input_audio`). - type: Optional[Literal["input_text", "input_audio"]] = None - """The content type (`input_text` or `input_audio`).""" + This is not sent to the model, but will be attached to the message item for + reference. + """ + + type: Optional[Literal["input_text", "input_audio", "input_image"]] = None + """The content type (`input_text`, `input_audio`, or `input_image`).""" class RealtimeConversationItemUserMessage(BaseModel): @@ -33,10 +54,16 @@ class RealtimeConversationItemUserMessage(BaseModel): """The type of the item. Always `message`.""" id: Optional[str] = None - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Optional[Literal["realtime.item"]] = None - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Optional[Literal["completed", "incomplete", "in_progress"]] = None """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_conversation_item_user_message_param.py b/src/openai/types/realtime/realtime_conversation_item_user_message_param.py index cff64a66bf..7d3b9bc137 100644 --- a/src/openai/types/realtime/realtime_conversation_item_user_message_param.py +++ b/src/openai/types/realtime/realtime_conversation_item_user_message_param.py @@ -10,16 +10,37 @@ class Content(TypedDict, total=False): audio: str - """Base64-encoded audio bytes (for `input_audio`).""" + """ + Base64-encoded audio bytes (for `input_audio`), these will be parsed as the + format specified in the session input audio type configuration. This defaults to + PCM 16-bit 24kHz mono if not specified. + """ + + detail: Literal["auto", "low", "high"] + """The detail level of the image (for `input_image`). + + `auto` will default to `high`. + """ + + image_url: str + """Base64-encoded image bytes (for `input_image`) as a data URI. + + For example `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported + formats are PNG and JPEG. 
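Since user message content now accepts `input_image` alongside `input_text` and `input_audio`, here is a hedged sketch of a user message item as a plain dict matching `RealtimeConversationItemUserMessageParam`; the prompt text is made up, and the truncated data URI is the placeholder from the docstring above.

```python
# Illustrative sketch only; values are placeholders, not a tested request.
user_item = {
    "type": "message",
    "role": "user",
    "content": [
        {"type": "input_text", "text": "What is in this image?"},
        {
            "type": "input_image",
            # Truncated placeholder data URI, copied from the docstring above.
            "image_url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",
            "detail": "auto",
        },
    ],
}
```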
+ """ text: str """The text content (for `input_text`).""" transcript: str - """Transcript of the audio (for `input_audio`).""" + """Transcript of the audio (for `input_audio`). - type: Literal["input_text", "input_audio"] - """The content type (`input_text` or `input_audio`).""" + This is not sent to the model, but will be attached to the message item for + reference. + """ + + type: Literal["input_text", "input_audio", "input_image"] + """The content type (`input_text`, `input_audio`, or `input_image`).""" class RealtimeConversationItemUserMessageParam(TypedDict, total=False): @@ -33,10 +54,16 @@ class RealtimeConversationItemUserMessageParam(TypedDict, total=False): """The type of the item. Always `message`.""" id: str - """The unique ID of the item.""" + """The unique ID of the item. + + This may be provided by the client or generated by the server. + """ object: Literal["realtime.item"] - """Identifier for the API object being returned - always `realtime.item`.""" + """Identifier for the API object being returned - always `realtime.item`. + + Optional when creating a new item. + """ status: Literal["completed", "incomplete", "in_progress"] """The status of the item. Has no effect on the conversation.""" diff --git a/src/openai/types/realtime/realtime_response.py b/src/openai/types/realtime/realtime_response.py index 54f5999b81..92d75491c0 100644 --- a/src/openai/types/realtime/realtime_response.py +++ b/src/openai/types/realtime/realtime_response.py @@ -6,15 +6,39 @@ from ..._models import BaseModel from ..shared.metadata import Metadata from .conversation_item import ConversationItem +from .realtime_audio_formats import RealtimeAudioFormats from .realtime_response_usage import RealtimeResponseUsage from .realtime_response_status import RealtimeResponseStatus -__all__ = ["RealtimeResponse"] +__all__ = ["RealtimeResponse", "Audio", "AudioOutput"] + + +class AudioOutput(BaseModel): + format: Optional[RealtimeAudioFormats] = None + """The format of the output audio.""" + + voice: Union[ + str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None + ] = None + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. + """ + + +class Audio(BaseModel): + output: Optional[AudioOutput] = None class RealtimeResponse(BaseModel): id: Optional[str] = None - """The unique ID of the response.""" + """The unique ID of the response, will look like `resp_1234`.""" + + audio: Optional[Audio] = None + """Configuration for audio output.""" conversation_id: Optional[str] = None """ @@ -23,8 +47,7 @@ class RealtimeResponse(BaseModel): the default conversation and the value of `conversation_id` will be an id like `conv_1234`. If `none`, the response will not be added to any conversation and the value of `conversation_id` will be `null`. If responses are being triggered - by server VAD, the response will be added to the default conversation, thus the - `conversation_id` will be an id like `conv_1234`. + automatically by VAD the response will be added to the default conversation """ max_output_tokens: Union[int, Literal["inf"], None] = None @@ -43,22 +66,19 @@ class RealtimeResponse(BaseModel): a maximum length of 512 characters. 
""" - modalities: Optional[List[Literal["text", "audio"]]] = None - """The set of modalities the model used to respond. - - If there are multiple modalities, the model will pick one, for example if - `modalities` is `["text", "audio"]`, the model could be responding in either - text or audio. - """ - object: Optional[Literal["realtime.response"]] = None """The object type, must be `realtime.response`.""" output: Optional[List[ConversationItem]] = None """The list of output items generated by the response.""" - output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" + output_modalities: Optional[List[Literal["text", "audio"]]] = None + """ + The set of modalities the model used to respond, currently the only possible + values are `[\"audio\"]`, `[\"text\"]`. Audio output always include a text + transcript. Setting the output to mode `text` will disable audio output from the + model. + """ status: Optional[Literal["completed", "cancelled", "failed", "incomplete", "in_progress"]] = None """ @@ -69,9 +89,6 @@ class RealtimeResponse(BaseModel): status_details: Optional[RealtimeResponseStatus] = None """Additional details about the status.""" - temperature: Optional[float] = None - """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.""" - usage: Optional[RealtimeResponseUsage] = None """Usage statistics for the Response, this will correspond to billing. @@ -79,11 +96,3 @@ class RealtimeResponse(BaseModel): to the Conversation, thus output from previous turns (text and audio tokens) will become the input for later turns. """ - - voice: Union[ - str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None - ] = None - """ - The voice the model used to respond. Current voice options are `alloy`, `ash`, - `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`. - """ diff --git a/src/openai/types/realtime/realtime_response_create_audio_output.py b/src/openai/types/realtime/realtime_response_create_audio_output.py new file mode 100644 index 0000000000..48a5d67e20 --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_audio_output.py @@ -0,0 +1,29 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Union, Optional +from typing_extensions import Literal + +from ..._models import BaseModel +from .realtime_audio_formats import RealtimeAudioFormats + +__all__ = ["RealtimeResponseCreateAudioOutput", "Output"] + + +class Output(BaseModel): + format: Optional[RealtimeAudioFormats] = None + """The format of the output audio.""" + + voice: Union[ + str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None + ] = None + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. 
+ """ + + +class RealtimeResponseCreateAudioOutput(BaseModel): + output: Optional[Output] = None diff --git a/src/openai/types/realtime/realtime_response_create_audio_output_param.py b/src/openai/types/realtime/realtime_response_create_audio_output_param.py new file mode 100644 index 0000000000..9aa6d28835 --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_audio_output_param.py @@ -0,0 +1,28 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Union +from typing_extensions import Literal, TypedDict + +from .realtime_audio_formats_param import RealtimeAudioFormatsParam + +__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output"] + + +class Output(TypedDict, total=False): + format: RealtimeAudioFormatsParam + """The format of the output audio.""" + + voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]] + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. + """ + + +class RealtimeResponseCreateAudioOutputParam(TypedDict, total=False): + output: Output diff --git a/src/openai/types/realtime/realtime_response_create_mcp_tool.py b/src/openai/types/realtime/realtime_response_create_mcp_tool.py new file mode 100644 index 0000000000..119b4a455d --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_mcp_tool.py @@ -0,0 +1,135 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Dict, List, Union, Optional +from typing_extensions import Literal, TypeAlias + +from ..._models import BaseModel + +__all__ = [ + "RealtimeResponseCreateMcpTool", + "AllowedTools", + "AllowedToolsMcpToolFilter", + "RequireApproval", + "RequireApprovalMcpToolApprovalFilter", + "RequireApprovalMcpToolApprovalFilterAlways", + "RequireApprovalMcpToolApprovalFilterNever", +] + + +class AllowedToolsMcpToolFilter(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. + """ + + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" + + +AllowedTools: TypeAlias = Union[List[str], AllowedToolsMcpToolFilter, None] + + +class RequireApprovalMcpToolApprovalFilterAlways(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. + """ + + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" + + +class RequireApprovalMcpToolApprovalFilterNever(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. 
+ """ + + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" + + +class RequireApprovalMcpToolApprovalFilter(BaseModel): + always: Optional[RequireApprovalMcpToolApprovalFilterAlways] = None + """A filter object to specify which tools are allowed.""" + + never: Optional[RequireApprovalMcpToolApprovalFilterNever] = None + """A filter object to specify which tools are allowed.""" + + +RequireApproval: TypeAlias = Union[RequireApprovalMcpToolApprovalFilter, Literal["always", "never"], None] + + +class RealtimeResponseCreateMcpTool(BaseModel): + server_label: str + """A label for this MCP server, used to identify it in tool calls.""" + + type: Literal["mcp"] + """The type of the MCP tool. Always `mcp`.""" + + allowed_tools: Optional[AllowedTools] = None + """List of allowed tool names or a filter object.""" + + authorization: Optional[str] = None + """ + An OAuth access token that can be used with a remote MCP server, either with a + custom MCP server URL or a service connector. Your application must handle the + OAuth authorization flow and provide the token here. + """ + + connector_id: Optional[ + Literal[ + "connector_dropbox", + "connector_gmail", + "connector_googlecalendar", + "connector_googledrive", + "connector_microsoftteams", + "connector_outlookcalendar", + "connector_outlookemail", + "connector_sharepoint", + ] + ] = None + """Identifier for service connectors, like those available in ChatGPT. + + One of `server_url` or `connector_id` must be provided. Learn more about service + connectors + [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + + Currently supported `connector_id` values are: + + - Dropbox: `connector_dropbox` + - Gmail: `connector_gmail` + - Google Calendar: `connector_googlecalendar` + - Google Drive: `connector_googledrive` + - Microsoft Teams: `connector_microsoftteams` + - Outlook Calendar: `connector_outlookcalendar` + - Outlook Email: `connector_outlookemail` + - SharePoint: `connector_sharepoint` + """ + + headers: Optional[Dict[str, str]] = None + """Optional HTTP headers to send to the MCP server. + + Use for authentication or other purposes. + """ + + require_approval: Optional[RequireApproval] = None + """Specify which of the MCP server's tools require approval.""" + + server_description: Optional[str] = None + """Optional description of the MCP server, used to provide more context.""" + + server_url: Optional[str] = None + """The URL for the MCP server. + + One of `server_url` or `connector_id` must be provided. + """ diff --git a/src/openai/types/realtime/realtime_response_create_mcp_tool_param.py b/src/openai/types/realtime/realtime_response_create_mcp_tool_param.py new file mode 100644 index 0000000000..3b9cf047c1 --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_mcp_tool_param.py @@ -0,0 +1,135 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Dict, Union, Optional +from typing_extensions import Literal, Required, TypeAlias, TypedDict + +from ..._types import SequenceNotStr + +__all__ = [ + "RealtimeResponseCreateMcpToolParam", + "AllowedTools", + "AllowedToolsMcpToolFilter", + "RequireApproval", + "RequireApprovalMcpToolApprovalFilter", + "RequireApprovalMcpToolApprovalFilterAlways", + "RequireApprovalMcpToolApprovalFilterNever", +] + + +class AllowedToolsMcpToolFilter(TypedDict, total=False): + read_only: bool + """Indicates whether or not a tool modifies data or is read-only. 
+ + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. + """ + + tool_names: SequenceNotStr[str] + """List of allowed tool names.""" + + +AllowedTools: TypeAlias = Union[SequenceNotStr[str], AllowedToolsMcpToolFilter] + + +class RequireApprovalMcpToolApprovalFilterAlways(TypedDict, total=False): + read_only: bool + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. + """ + + tool_names: SequenceNotStr[str] + """List of allowed tool names.""" + + +class RequireApprovalMcpToolApprovalFilterNever(TypedDict, total=False): + read_only: bool + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. + """ + + tool_names: SequenceNotStr[str] + """List of allowed tool names.""" + + +class RequireApprovalMcpToolApprovalFilter(TypedDict, total=False): + always: RequireApprovalMcpToolApprovalFilterAlways + """A filter object to specify which tools are allowed.""" + + never: RequireApprovalMcpToolApprovalFilterNever + """A filter object to specify which tools are allowed.""" + + +RequireApproval: TypeAlias = Union[RequireApprovalMcpToolApprovalFilter, Literal["always", "never"]] + + +class RealtimeResponseCreateMcpToolParam(TypedDict, total=False): + server_label: Required[str] + """A label for this MCP server, used to identify it in tool calls.""" + + type: Required[Literal["mcp"]] + """The type of the MCP tool. Always `mcp`.""" + + allowed_tools: Optional[AllowedTools] + """List of allowed tool names or a filter object.""" + + authorization: str + """ + An OAuth access token that can be used with a remote MCP server, either with a + custom MCP server URL or a service connector. Your application must handle the + OAuth authorization flow and provide the token here. + """ + + connector_id: Literal[ + "connector_dropbox", + "connector_gmail", + "connector_googlecalendar", + "connector_googledrive", + "connector_microsoftteams", + "connector_outlookcalendar", + "connector_outlookemail", + "connector_sharepoint", + ] + """Identifier for service connectors, like those available in ChatGPT. + + One of `server_url` or `connector_id` must be provided. Learn more about service + connectors + [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + + Currently supported `connector_id` values are: + + - Dropbox: `connector_dropbox` + - Gmail: `connector_gmail` + - Google Calendar: `connector_googlecalendar` + - Google Drive: `connector_googledrive` + - Microsoft Teams: `connector_microsoftteams` + - Outlook Calendar: `connector_outlookcalendar` + - Outlook Email: `connector_outlookemail` + - SharePoint: `connector_sharepoint` + """ + + headers: Optional[Dict[str, str]] + """Optional HTTP headers to send to the MCP server. + + Use for authentication or other purposes. + """ + + require_approval: Optional[RequireApproval] + """Specify which of the MCP server's tools require approval.""" + + server_description: str + """Optional description of the MCP server, used to provide more context.""" + + server_url: str + """The URL for the MCP server. 
+ + One of `server_url` or `connector_id` must be provided. + """ diff --git a/src/openai/types/realtime/realtime_response_create_params.py b/src/openai/types/realtime/realtime_response_create_params.py new file mode 100644 index 0000000000..3b5a8907a1 --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_params.py @@ -0,0 +1,98 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Union, Optional +from typing_extensions import Literal, TypeAlias + +from .models import Models +from ..._models import BaseModel +from ..shared.metadata import Metadata +from .conversation_item import ConversationItem +from ..responses.response_prompt import ResponsePrompt +from ..responses.tool_choice_mcp import ToolChoiceMcp +from ..responses.tool_choice_options import ToolChoiceOptions +from ..responses.tool_choice_function import ToolChoiceFunction +from .realtime_response_create_mcp_tool import RealtimeResponseCreateMcpTool +from .realtime_response_create_audio_output import RealtimeResponseCreateAudioOutput + +__all__ = ["RealtimeResponseCreateParams", "ToolChoice", "Tool"] + +ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp] + +Tool: TypeAlias = Union[Models, RealtimeResponseCreateMcpTool] + + +class RealtimeResponseCreateParams(BaseModel): + audio: Optional[RealtimeResponseCreateAudioOutput] = None + """Configuration for audio input and output.""" + + conversation: Union[str, Literal["auto", "none"], None] = None + """Controls which conversation the response is added to. + + Currently supports `auto` and `none`, with `auto` as the default value. The + `auto` value means that the contents of the response will be added to the + default conversation. Set this to `none` to create an out-of-band response which + will not add items to default conversation. + """ + + input: Optional[List[ConversationItem]] = None + """Input items to include in the prompt for the model. + + Using this field creates a new context for this Response instead of using the + default conversation. An empty array `[]` will clear the context for this + Response. Note that this can include references to items that previously + appeared in the session using their id. + """ + + instructions: Optional[str] = None + """The default system instructions (i.e. + + system message) prepended to model calls. This field allows the client to guide + the model on desired responses. The model can be instructed on response content + and format, (e.g. "be extremely succinct", "act friendly", "here are examples of + good responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed to be + followed by the model, but they provide guidance to the model on the desired + behavior. Note that the server sets default instructions which will be used if + this field is not set and are visible in the `session.created` event at the + start of the session. + """ + + max_output_tokens: Union[int, Literal["inf"], None] = None + """ + Maximum number of output tokens for a single assistant response, inclusive of + tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + `inf` for the maximum available tokens for a given model. Defaults to `inf`. + """ + + metadata: Optional[Metadata] = None + """Set of 16 key-value pairs that can be attached to an object. 
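The MCP tool shape added above (`RealtimeResponseCreateMcpTool` and its param variant) is, on the request side, just a typed dict. A minimal sketch of what one entry might look like, assuming the label, server URL, tool names, and header value are illustrative placeholders rather than anything taken from this changeset:

```python
from openai.types.realtime.realtime_response_create_mcp_tool_param import (
    RealtimeResponseCreateMcpToolParam,
)

# Hypothetical MCP tool entry; every value below is a placeholder.
mcp_tool: RealtimeResponseCreateMcpToolParam = {
    "type": "mcp",
    "server_label": "docs-search",             # label echoed back in tool calls
    "server_url": "https://example.com/mcp",   # alternatively supply a connector_id
    "allowed_tools": ["search", "fetch"],      # list form; a filter object also works
    "require_approval": "never",               # or a per-tool approval filter object
    "headers": {"Authorization": "Bearer <token>"},
}
```

Per the docstrings above, exactly one of `server_url` or `connector_id` should be supplied; the sketch uses `server_url`.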
+ + This can be useful for storing additional information about the object in a + structured format, and querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + """ + + output_modalities: Optional[List[Literal["text", "audio"]]] = None + """ + The set of modalities the model used to respond, currently the only possible + values are `[\"audio\"]`, `[\"text\"]`. Audio output always include a text + transcript. Setting the output to mode `text` will disable audio output from the + model. + """ + + prompt: Optional[ResponsePrompt] = None + """Reference to a prompt template and its variables. + + [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + """ + + tool_choice: Optional[ToolChoice] = None + """How the model chooses tools. + + Provide one of the string modes or force a specific function/MCP tool. + """ + + tools: Optional[List[Tool]] = None + """Tools available to the model.""" diff --git a/src/openai/types/realtime/realtime_response_create_params_param.py b/src/openai/types/realtime/realtime_response_create_params_param.py new file mode 100644 index 0000000000..6800d36a31 --- /dev/null +++ b/src/openai/types/realtime/realtime_response_create_params_param.py @@ -0,0 +1,99 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import List, Union, Iterable, Optional +from typing_extensions import Literal, TypeAlias, TypedDict + +from .models_param import ModelsParam +from ..shared_params.metadata import Metadata +from .conversation_item_param import ConversationItemParam +from ..responses.tool_choice_options import ToolChoiceOptions +from ..responses.response_prompt_param import ResponsePromptParam +from ..responses.tool_choice_mcp_param import ToolChoiceMcpParam +from ..responses.tool_choice_function_param import ToolChoiceFunctionParam +from .realtime_response_create_mcp_tool_param import RealtimeResponseCreateMcpToolParam +from .realtime_response_create_audio_output_param import RealtimeResponseCreateAudioOutputParam + +__all__ = ["RealtimeResponseCreateParamsParam", "ToolChoice", "Tool"] + +ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunctionParam, ToolChoiceMcpParam] + +Tool: TypeAlias = Union[ModelsParam, RealtimeResponseCreateMcpToolParam] + + +class RealtimeResponseCreateParamsParam(TypedDict, total=False): + audio: RealtimeResponseCreateAudioOutputParam + """Configuration for audio input and output.""" + + conversation: Union[str, Literal["auto", "none"]] + """Controls which conversation the response is added to. + + Currently supports `auto` and `none`, with `auto` as the default value. The + `auto` value means that the contents of the response will be added to the + default conversation. Set this to `none` to create an out-of-band response which + will not add items to default conversation. + """ + + input: Iterable[ConversationItemParam] + """Input items to include in the prompt for the model. + + Using this field creates a new context for this Response instead of using the + default conversation. An empty array `[]` will clear the context for this + Response. Note that this can include references to items that previously + appeared in the session using their id. + """ + + instructions: str + """The default system instructions (i.e. + + system message) prepended to model calls. 
This field allows the client to guide + the model on desired responses. The model can be instructed on response content + and format, (e.g. "be extremely succinct", "act friendly", "here are examples of + good responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed to be + followed by the model, but they provide guidance to the model on the desired + behavior. Note that the server sets default instructions which will be used if + this field is not set and are visible in the `session.created` event at the + start of the session. + """ + + max_output_tokens: Union[int, Literal["inf"]] + """ + Maximum number of output tokens for a single assistant response, inclusive of + tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + `inf` for the maximum available tokens for a given model. Defaults to `inf`. + """ + + metadata: Optional[Metadata] + """Set of 16 key-value pairs that can be attached to an object. + + This can be useful for storing additional information about the object in a + structured format, and querying for objects via API or the dashboard. + + Keys are strings with a maximum length of 64 characters. Values are strings with + a maximum length of 512 characters. + """ + + output_modalities: List[Literal["text", "audio"]] + """ + The set of modalities the model used to respond, currently the only possible + values are `[\"audio\"]`, `[\"text\"]`. Audio output always include a text + transcript. Setting the output to mode `text` will disable audio output from the + model. + """ + + prompt: Optional[ResponsePromptParam] + """Reference to a prompt template and its variables. + + [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). + """ + + tool_choice: ToolChoice + """How the model chooses tools. + + Provide one of the string modes or force a specific function/MCP tool. + """ + + tools: Iterable[Tool] + """Tools available to the model.""" diff --git a/src/openai/types/realtime/realtime_response_usage.py b/src/openai/types/realtime/realtime_response_usage.py index dbce5f28c3..fb8893b346 100644 --- a/src/openai/types/realtime/realtime_response_usage.py +++ b/src/openai/types/realtime/realtime_response_usage.py @@ -11,7 +11,13 @@ class RealtimeResponseUsage(BaseModel): input_token_details: Optional[RealtimeResponseUsageInputTokenDetails] = None - """Details about the input tokens used in the Response.""" + """Details about the input tokens used in the Response. + + Cached tokens are tokens from previous turns in the conversation that are + included as context for the current response. Cached tokens here are counted as + a subset of input tokens, meaning input tokens will include cached and uncached + tokens. 
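The new `RealtimeResponseCreateParams` shape supports out-of-band responses: `conversation: "none"` keeps the generated items out of the default conversation while `input` supplies a standalone context. A hedged sketch of such a payload, assuming the usual Realtime user-message item shape for the `input` entry; the instructions, metadata, and message text are invented for illustration:

```python
from openai.types.realtime.realtime_response_create_params_param import (
    RealtimeResponseCreateParamsParam,
)

# Hypothetical out-of-band response request.
out_of_band: RealtimeResponseCreateParamsParam = {
    "conversation": "none",          # do not add items to the default conversation
    "output_modalities": ["text"],   # text-only output, no audio
    "metadata": {"purpose": "summary"},
    "instructions": "Summarize the conversation so far in two sentences.",
    "input": [
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "Placeholder user turn."}],
        }
    ],
}
```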
+ """ input_tokens: Optional[int] = None """ diff --git a/src/openai/types/realtime/realtime_response_usage_input_token_details.py b/src/openai/types/realtime/realtime_response_usage_input_token_details.py index dfeead90ef..e14a74a84e 100644 --- a/src/openai/types/realtime/realtime_response_usage_input_token_details.py +++ b/src/openai/types/realtime/realtime_response_usage_input_token_details.py @@ -4,15 +4,32 @@ from ..._models import BaseModel -__all__ = ["RealtimeResponseUsageInputTokenDetails"] +__all__ = ["RealtimeResponseUsageInputTokenDetails", "CachedTokensDetails"] + + +class CachedTokensDetails(BaseModel): + audio_tokens: Optional[int] = None + """The number of cached audio tokens used as input for the Response.""" + + image_tokens: Optional[int] = None + """The number of cached image tokens used as input for the Response.""" + + text_tokens: Optional[int] = None + """The number of cached text tokens used as input for the Response.""" class RealtimeResponseUsageInputTokenDetails(BaseModel): audio_tokens: Optional[int] = None - """The number of audio tokens used in the Response.""" + """The number of audio tokens used as input for the Response.""" cached_tokens: Optional[int] = None - """The number of cached tokens used in the Response.""" + """The number of cached tokens used as input for the Response.""" + + cached_tokens_details: Optional[CachedTokensDetails] = None + """Details about the cached tokens used as input for the Response.""" + + image_tokens: Optional[int] = None + """The number of image tokens used as input for the Response.""" text_tokens: Optional[int] = None - """The number of text tokens used in the Response.""" + """The number of text tokens used as input for the Response.""" diff --git a/src/openai/types/realtime/realtime_session.py b/src/openai/types/realtime/realtime_session.py deleted file mode 100644 index fdb5e9419a..0000000000 --- a/src/openai/types/realtime/realtime_session.py +++ /dev/null @@ -1,307 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List, Union, Optional -from typing_extensions import Literal, TypeAlias - -from ..._models import BaseModel -from ..responses.response_prompt import ResponsePrompt - -__all__ = [ - "RealtimeSession", - "InputAudioNoiseReduction", - "InputAudioTranscription", - "Tool", - "Tracing", - "TracingTracingConfiguration", - "TurnDetection", -] - - -class InputAudioNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - """Type of noise reduction. - - `near_field` is for close-talking microphones such as headphones, `far_field` is - for far-field microphones such as laptop or conference room microphones. - """ - - -class InputAudioTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[str] = None - """ - The model to use for transcription, current options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1`. - """ - - prompt: Optional[str] = None - """ - An optional text to guide the model's style or continue a previous audio - segment. For `whisper-1`, the - [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". 
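The expanded usage types above add a cached-token breakdown (`cached_tokens_details` with text/audio/image counts) alongside the existing totals. A small sketch that reads these fields defensively from a parsed `RealtimeResponseUsage` object, e.g. one taken off a completed response:

```python
from openai.types.realtime.realtime_response_usage import RealtimeResponseUsage


def summarize_input_tokens(usage: RealtimeResponseUsage) -> str:
    """Render the input-token accounting, including the new cached-token breakdown."""
    details = usage.input_token_details
    if details is None:
        return f"{usage.input_tokens or 0} input tokens (no details)"
    line = f"{usage.input_tokens or 0} input tokens, {details.cached_tokens or 0} cached"
    if details.cached_tokens_details is not None:
        cd = details.cached_tokens_details
        line += f" (cached text={cd.text_tokens}, audio={cd.audio_tokens}, image={cd.image_tokens})"
    return line
```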
- """ - - -class Tool(BaseModel): - description: Optional[str] = None - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: Optional[str] = None - """The name of the function.""" - - parameters: Optional[object] = None - """Parameters of the function in JSON Schema.""" - - type: Optional[Literal["function"]] = None - """The type of the tool, i.e. `function`.""" - - -class TracingTracingConfiguration(BaseModel): - group_id: Optional[str] = None - """ - The group id to attach to this trace to enable filtering and grouping in the - traces dashboard. - """ - - metadata: Optional[object] = None - """ - The arbitrary metadata to attach to this trace to enable filtering in the traces - dashboard. - """ - - workflow_name: Optional[str] = None - """The name of the workflow to attach to this trace. - - This is used to name the trace in the traces dashboard. - """ - - -Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration, None] - - -class TurnDetection(BaseModel): - create_response: Optional[bool] = None - """ - Whether or not to automatically generate a response when a VAD stop event - occurs. - """ - - eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None - """Used only for `semantic_vad` mode. - - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. - """ - - idle_timeout_ms: Optional[int] = None - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received. - """ - - interrupt_response: Optional[bool] = None - """ - Whether or not to automatically interrupt any ongoing response with output to - the default conversation (i.e. `conversation` of `auto`) when a VAD start event - occurs. - """ - - prefix_padding_ms: Optional[int] = None - """Used only for `server_vad` mode. - - Amount of audio to include before the VAD detected speech (in milliseconds). - Defaults to 300ms. - """ - - silence_duration_ms: Optional[int] = None - """Used only for `server_vad` mode. - - Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. - With shorter values the model will respond more quickly, but may jump in on - short pauses from the user. - """ - - threshold: Optional[float] = None - """Used only for `server_vad` mode. - - Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher - threshold will require louder audio to activate the model, and thus might - perform better in noisy environments. - """ - - type: Optional[Literal["server_vad", "semantic_vad"]] = None - """Type of turn detection.""" - - -class RealtimeSession(BaseModel): - id: Optional[str] = None - """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" - - expires_at: Optional[int] = None - """Expiration timestamp for the session, in seconds since epoch.""" - - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None - """Additional fields to include in server outputs. - - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. - """ - - input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of input audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. 
For `pcm16`, input audio must - be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian - byte order. - """ - - input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None - """Configuration for input audio noise reduction. - - This can be set to `null` to turn off. Noise reduction filters audio added to - the input audio buffer before it is sent to VAD and the model. Filtering the - audio can improve VAD and turn detection accuracy (reducing false positives) and - model performance by improving perception of the input audio. - """ - - input_audio_transcription: Optional[InputAudioTranscription] = None - """ - Configuration for input audio transcription, defaults to off and can be set to - `null` to turn off once on. Input audio transcription is not native to the - model, since the model consumes audio directly. Transcription runs - asynchronously through - [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) - and should be treated as guidance of input audio content rather than precisely - what the model heard. The client can optionally set the language and prompt for - transcription, these offer additional guidance to the transcription service. - """ - - instructions: Optional[str] = None - """The default system instructions (i.e. - - system message) prepended to model calls. This field allows the client to guide - the model on desired responses. The model can be instructed on response content - and format, (e.g. "be extremely succinct", "act friendly", "here are examples of - good responses") and on audio behavior (e.g. "talk quickly", "inject emotion - into your voice", "laugh frequently"). The instructions are not guaranteed to be - followed by the model, but they provide guidance to the model on the desired - behavior. - - Note that the server sets default instructions which will be used if this field - is not set and are visible in the `session.created` event at the start of the - session. - """ - - max_response_output_tokens: Union[int, Literal["inf"], None] = None - """ - Maximum number of output tokens for a single assistant response, inclusive of - tool calls. Provide an integer between 1 and 4096 to limit output tokens, or - `inf` for the maximum available tokens for a given model. Defaults to `inf`. - """ - - modalities: Optional[List[Literal["text", "audio"]]] = None - """The set of modalities the model can respond with. - - To disable audio, set this to ["text"]. - """ - - model: Optional[ - Literal[ - "gpt-realtime", - "gpt-realtime-2025-08-28", - "gpt-4o-realtime-preview", - "gpt-4o-realtime-preview-2024-10-01", - "gpt-4o-realtime-preview-2024-12-17", - "gpt-4o-realtime-preview-2025-06-03", - "gpt-4o-mini-realtime-preview", - "gpt-4o-mini-realtime-preview-2024-12-17", - ] - ] = None - """The Realtime model used for this session.""" - - object: Optional[Literal["realtime.session"]] = None - """The object type. Always `realtime.session`.""" - - output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of output audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, output audio is - sampled at a rate of 24kHz. - """ - - prompt: Optional[ResponsePrompt] = None - """Reference to a prompt template and its variables. - - [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). - """ - - speed: Optional[float] = None - """The speed of the model's spoken response. - - 1.0 is the default speed. 
0.25 is the minimum speed. 1.5 is the maximum speed. - This value can only be changed in between model turns, not while a response is - in progress. - """ - - temperature: Optional[float] = None - """Sampling temperature for the model, limited to [0.6, 1.2]. - - For audio models a temperature of 0.8 is highly recommended for best - performance. - """ - - tool_choice: Optional[str] = None - """How the model chooses tools. - - Options are `auto`, `none`, `required`, or specify a function. - """ - - tools: Optional[List[Tool]] = None - """Tools (functions) available to the model.""" - - tracing: Optional[Tracing] = None - """Configuration options for tracing. - - Set to null to disable tracing. Once tracing is enabled for a session, the - configuration cannot be modified. - - `auto` will create a trace for the session with default values for the workflow - name, group id, and metadata. - """ - - turn_detection: Optional[TurnDetection] = None - """Configuration for turn detection, ether Server VAD or Semantic VAD. - - This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. - Semantic VAD is more advanced and uses a turn detection model (in conjunction - with VAD) to semantically estimate whether the user has finished speaking, then - dynamically sets a timeout based on this probability. For example, if user audio - trails off with "uhhm", the model will score a low probability of turn end and - wait longer for the user to continue speaking. This can be useful for more - natural conversations, but may have a higher latency. - """ - - voice: Union[ - str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None - ] = None - """The voice the model uses to respond. - - Voice cannot be changed during the session once the model has responded with - audio at least once. Current voice options are `alloy`, `ash`, `ballad`, - `coral`, `echo`, `sage`, `shimmer`, and `verse`. - """ diff --git a/src/openai/types/realtime/realtime_session_client_secret.py b/src/openai/types/realtime/realtime_session_client_secret.py new file mode 100644 index 0000000000..a4998802bb --- /dev/null +++ b/src/openai/types/realtime/realtime_session_client_secret.py @@ -0,0 +1,20 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from ..._models import BaseModel + +__all__ = ["RealtimeSessionClientSecret"] + + +class RealtimeSessionClientSecret(BaseModel): + expires_at: int + """Timestamp for when the token expires. + + Currently, all tokens expire after one minute. + """ + + value: str + """ + Ephemeral key usable in client environments to authenticate connections to the + Realtime API. Use this in client-side environments rather than a standard API + token, which should only be used server-side. 
+ """ diff --git a/src/openai/types/realtime/realtime_session_create_request.py b/src/openai/types/realtime/realtime_session_create_request.py index 85205add50..578bc43821 100644 --- a/src/openai/types/realtime/realtime_session_create_request.py +++ b/src/openai/types/realtime/realtime_session_create_request.py @@ -10,43 +10,22 @@ from .realtime_tracing_config import RealtimeTracingConfig from ..responses.response_prompt import ResponsePrompt from .realtime_tool_choice_config import RealtimeToolChoiceConfig -from .realtime_client_secret_config import RealtimeClientSecretConfig __all__ = ["RealtimeSessionCreateRequest"] class RealtimeSessionCreateRequest(BaseModel): - model: Union[ - str, - Literal[ - "gpt-realtime", - "gpt-realtime-2025-08-28", - "gpt-4o-realtime", - "gpt-4o-mini-realtime", - "gpt-4o-realtime-preview", - "gpt-4o-realtime-preview-2024-10-01", - "gpt-4o-realtime-preview-2024-12-17", - "gpt-4o-realtime-preview-2025-06-03", - "gpt-4o-mini-realtime-preview", - "gpt-4o-mini-realtime-preview-2024-12-17", - ], - ] - """The Realtime model used for this session.""" - type: Literal["realtime"] """The type of session to create. Always `realtime` for the Realtime API.""" audio: Optional[RealtimeAudioConfig] = None """Configuration for input and output audio.""" - client_secret: Optional[RealtimeClientSecretConfig] = None - """Configuration options for the generated client secret.""" - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None """Additional fields to include in server outputs. - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. + `item.input_audio_transcription.logprobs`: Include logprobs for input audio + transcription. """ instructions: Optional[str] = None @@ -72,10 +51,28 @@ class RealtimeSessionCreateRequest(BaseModel): `inf` for the maximum available tokens for a given model. Defaults to `inf`. """ + model: Union[ + str, + Literal[ + "gpt-realtime", + "gpt-realtime-2025-08-28", + "gpt-4o-realtime-preview", + "gpt-4o-realtime-preview-2024-10-01", + "gpt-4o-realtime-preview-2024-12-17", + "gpt-4o-realtime-preview-2025-06-03", + "gpt-4o-mini-realtime-preview", + "gpt-4o-mini-realtime-preview-2024-12-17", + ], + None, + ] = None + """The Realtime model used for this session.""" + output_modalities: Optional[List[Literal["text", "audio"]]] = None """The set of modalities the model can respond with. - To disable audio, set this to ["text"]. + It defaults to `["audio"]`, indicating that the model will respond with audio + plus a transcript. `["text"]` can be used to make the model respond with text + only. It is not possible to request both `text` and `audio` at the same time. """ prompt: Optional[ResponsePrompt] = None @@ -84,13 +81,6 @@ class RealtimeSessionCreateRequest(BaseModel): [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). """ - temperature: Optional[float] = None - """Sampling temperature for the model, limited to [0.6, 1.2]. - - For audio models a temperature of 0.8 is highly recommended for best - performance. - """ - tool_choice: Optional[RealtimeToolChoiceConfig] = None """How the model chooses tools. @@ -101,10 +91,10 @@ class RealtimeSessionCreateRequest(BaseModel): """Tools available to the model.""" tracing: Optional[RealtimeTracingConfig] = None - """Configuration options for tracing. - - Set to null to disable tracing. Once tracing is enabled for a session, the - configuration cannot be modified. 
+ """ + Realtime API can write session traces to the + [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + tracing is enabled for a session, the configuration cannot be modified. `auto` will create a trace for the session with default values for the workflow name, group id, and metadata. @@ -113,6 +103,5 @@ class RealtimeSessionCreateRequest(BaseModel): truncation: Optional[RealtimeTruncation] = None """ Controls how the realtime conversation is truncated prior to model inference. - The default is `auto`. When set to `retention_ratio`, the server retains a - fraction of the conversation tokens prior to the instructions. + The default is `auto`. """ diff --git a/src/openai/types/realtime/realtime_session_create_request_param.py b/src/openai/types/realtime/realtime_session_create_request_param.py index 8f962ca0e2..5f7819fa61 100644 --- a/src/openai/types/realtime/realtime_session_create_request_param.py +++ b/src/openai/types/realtime/realtime_session_create_request_param.py @@ -11,45 +11,22 @@ from .realtime_tracing_config_param import RealtimeTracingConfigParam from ..responses.response_prompt_param import ResponsePromptParam from .realtime_tool_choice_config_param import RealtimeToolChoiceConfigParam -from .realtime_client_secret_config_param import RealtimeClientSecretConfigParam __all__ = ["RealtimeSessionCreateRequestParam"] class RealtimeSessionCreateRequestParam(TypedDict, total=False): - model: Required[ - Union[ - str, - Literal[ - "gpt-realtime", - "gpt-realtime-2025-08-28", - "gpt-4o-realtime", - "gpt-4o-mini-realtime", - "gpt-4o-realtime-preview", - "gpt-4o-realtime-preview-2024-10-01", - "gpt-4o-realtime-preview-2024-12-17", - "gpt-4o-realtime-preview-2025-06-03", - "gpt-4o-mini-realtime-preview", - "gpt-4o-mini-realtime-preview-2024-12-17", - ], - ] - ] - """The Realtime model used for this session.""" - type: Required[Literal["realtime"]] """The type of session to create. Always `realtime` for the Realtime API.""" audio: RealtimeAudioConfigParam """Configuration for input and output audio.""" - client_secret: RealtimeClientSecretConfigParam - """Configuration options for the generated client secret.""" - include: List[Literal["item.input_audio_transcription.logprobs"]] """Additional fields to include in server outputs. - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. + `item.input_audio_transcription.logprobs`: Include logprobs for input audio + transcription. """ instructions: str @@ -75,10 +52,27 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False): `inf` for the maximum available tokens for a given model. Defaults to `inf`. """ + model: Union[ + str, + Literal[ + "gpt-realtime", + "gpt-realtime-2025-08-28", + "gpt-4o-realtime-preview", + "gpt-4o-realtime-preview-2024-10-01", + "gpt-4o-realtime-preview-2024-12-17", + "gpt-4o-realtime-preview-2025-06-03", + "gpt-4o-mini-realtime-preview", + "gpt-4o-mini-realtime-preview-2024-12-17", + ], + ] + """The Realtime model used for this session.""" + output_modalities: List[Literal["text", "audio"]] """The set of modalities the model can respond with. - To disable audio, set this to ["text"]. + It defaults to `["audio"]`, indicating that the model will respond with audio + plus a transcript. `["text"]` can be used to make the model respond with text + only. It is not possible to request both `text` and `audio` at the same time. 
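With the updated session create request, `type` is the only required field, `model` is optional, and the old `client_secret` and `temperature` fields are gone. A hedged sketch of minting an ephemeral key through the client-secrets endpoint from this changeset, with placeholder session settings:

```python
from openai import OpenAI

client = OpenAI()

# Hypothetical session config; only `type` is required on the realtime session.
secret = client.realtime.client_secrets.create(
    session={
        "type": "realtime",
        "model": "gpt-realtime",
        "output_modalities": ["audio"],
        "instructions": "Speak briefly and clearly.",
    },
)

# Assuming the create response surfaces the ephemeral key fields described by
# RealtimeSessionClientSecret (value / expires_at).
print(secret.value, secret.expires_at)
```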
""" prompt: Optional[ResponsePromptParam] @@ -87,13 +81,6 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False): [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). """ - temperature: float - """Sampling temperature for the model, limited to [0.6, 1.2]. - - For audio models a temperature of 0.8 is highly recommended for best - performance. - """ - tool_choice: RealtimeToolChoiceConfigParam """How the model chooses tools. @@ -104,10 +91,10 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False): """Tools available to the model.""" tracing: Optional[RealtimeTracingConfigParam] - """Configuration options for tracing. - - Set to null to disable tracing. Once tracing is enabled for a session, the - configuration cannot be modified. + """ + Realtime API can write session traces to the + [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + tracing is enabled for a session, the configuration cannot be modified. `auto` will create a trace for the session with default values for the workflow name, group id, and metadata. @@ -116,6 +103,5 @@ class RealtimeSessionCreateRequestParam(TypedDict, total=False): truncation: RealtimeTruncationParam """ Controls how the realtime conversation is truncated prior to model inference. - The default is `auto`. When set to `retention_ratio`, the server retains a - fraction of the conversation tokens prior to the instructions. + The default is `auto`. """ diff --git a/src/openai/types/realtime/realtime_session_create_response.py b/src/openai/types/realtime/realtime_session_create_response.py index 82fa426982..9c10b84588 100644 --- a/src/openai/types/realtime/realtime_session_create_response.py +++ b/src/openai/types/realtime/realtime_session_create_response.py @@ -1,74 +1,171 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional +from typing import Dict, List, Union, Optional from typing_extensions import Literal, TypeAlias +from .models import Models from ..._models import BaseModel +from .audio_transcription import AudioTranscription +from .realtime_truncation import RealtimeTruncation +from .noise_reduction_type import NoiseReductionType +from .realtime_audio_formats import RealtimeAudioFormats +from ..responses.response_prompt import ResponsePrompt +from ..responses.tool_choice_mcp import ToolChoiceMcp +from ..responses.tool_choice_options import ToolChoiceOptions +from .realtime_session_client_secret import RealtimeSessionClientSecret +from ..responses.tool_choice_function import ToolChoiceFunction __all__ = [ "RealtimeSessionCreateResponse", "Audio", "AudioInput", "AudioInputNoiseReduction", - "AudioInputTranscription", "AudioInputTurnDetection", "AudioOutput", + "ToolChoice", "Tool", + "ToolMcpTool", + "ToolMcpToolAllowedTools", + "ToolMcpToolAllowedToolsMcpToolFilter", + "ToolMcpToolRequireApproval", + "ToolMcpToolRequireApprovalMcpToolApprovalFilter", + "ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways", + "ToolMcpToolRequireApprovalMcpToolApprovalFilterNever", "Tracing", "TracingTracingConfiguration", - "TurnDetection", ] class AudioInputNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None + type: Optional[NoiseReductionType] = None + """Type of noise reduction. + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. 
+ """ -class AudioInputTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio.""" - model: Optional[str] = None - """The model to use for transcription.""" +class AudioInputTurnDetection(BaseModel): + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ - prompt: Optional[str] = None - """Optional text to guide the model's style or continue a previous audio segment.""" + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + idle_timeout_ms: Optional[int] = None + """ + Optional idle timeout after which turn detection will auto-timeout when no + additional audio is received. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ -class AudioInputTurnDetection(BaseModel): prefix_padding_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ silence_duration_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ threshold: Optional[float] = None + """Used only for `server_vad` mode. - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" class AudioInput(BaseModel): - format: Optional[str] = None - """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" + format: Optional[RealtimeAudioFormats] = None + """The format of the input audio.""" noise_reduction: Optional[AudioInputNoiseReduction] = None - """Configuration for input audio noise reduction.""" + """Configuration for input audio noise reduction. - transcription: Optional[AudioInputTranscription] = None - """Configuration for input audio transcription.""" + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + transcription: Optional[AudioTranscription] = None + """ + Configuration for input audio transcription, defaults to off and can be set to + `null` to turn off once on. Input audio transcription is not native to the + model, since the model consumes audio directly. 
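The input-audio fields documented above (noise reduction type, transcription, and the expanded server/semantic VAD turn detection) combine into a single input block. A hedged sketch in plain-dict form; the eagerness, model, and noise-reduction choices are illustrative, and where exactly this block sits inside a session create/update payload is assumed rather than shown here:

```python
# Hypothetical input-audio configuration; all values are placeholders.
audio_input = {
    "noise_reduction": {"type": "near_field"},  # close-talking mic, e.g. headphones
    "transcription": {"model": "gpt-4o-mini-transcribe", "language": "en"},
    "turn_detection": {
        "type": "semantic_vad",
        "eagerness": "low",          # wait longer before treating the turn as finished
        "create_response": True,     # auto-respond when VAD detects a stop
        "interrupt_response": True,  # allow barge-in to cut off an active response
    },
}
```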
Transcription runs + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. + """ turn_detection: Optional[AudioInputTurnDetection] = None - """Configuration for turn detection.""" + """Configuration for turn detection, ether Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. + """ class AudioOutput(BaseModel): - format: Optional[str] = None - """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" + format: Optional[RealtimeAudioFormats] = None + """The format of the output audio.""" speed: Optional[float] = None + """ + The speed of the model's spoken response as a multiple of the original speed. + 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + This value can only be changed in between model turns, not while a response is + in progress. + + This parameter is a post-processing adjustment to the audio after it is + generated, it's also possible to prompt the model to speak faster or slower. + """ voice: Union[ str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None ] = None + """The voice the model uses to respond. + + Voice cannot be changed during the session once the model has responded with + audio at least once. Current voice options are `alloy`, `ash`, `ballad`, + `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. We recommend + `marin` and `cedar` for best quality. + """ class Audio(BaseModel): @@ -77,86 +174,168 @@ class Audio(BaseModel): output: Optional[AudioOutput] = None -class Tool(BaseModel): - description: Optional[str] = None - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). +ToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp] + + +class ToolMcpToolAllowedToolsMcpToolFilter(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. """ - name: Optional[str] = None - """The name of the function.""" + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" - parameters: Optional[object] = None - """Parameters of the function in JSON Schema.""" - type: Optional[Literal["function"]] = None - """The type of the tool, i.e. 
`function`.""" +ToolMcpToolAllowedTools: TypeAlias = Union[List[str], ToolMcpToolAllowedToolsMcpToolFilter, None] -class TracingTracingConfiguration(BaseModel): - group_id: Optional[str] = None +class ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. """ - The group id to attach to this trace to enable filtering and grouping in the - traces dashboard. + + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" + + +class ToolMcpToolRequireApprovalMcpToolApprovalFilterNever(BaseModel): + read_only: Optional[bool] = None + """Indicates whether or not a tool modifies data or is read-only. + + If an MCP server is + [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint), + it will match this filter. """ - metadata: Optional[object] = None + tool_names: Optional[List[str]] = None + """List of allowed tool names.""" + + +class ToolMcpToolRequireApprovalMcpToolApprovalFilter(BaseModel): + always: Optional[ToolMcpToolRequireApprovalMcpToolApprovalFilterAlways] = None + """A filter object to specify which tools are allowed.""" + + never: Optional[ToolMcpToolRequireApprovalMcpToolApprovalFilterNever] = None + """A filter object to specify which tools are allowed.""" + + +ToolMcpToolRequireApproval: TypeAlias = Union[ + ToolMcpToolRequireApprovalMcpToolApprovalFilter, Literal["always", "never"], None +] + + +class ToolMcpTool(BaseModel): + server_label: str + """A label for this MCP server, used to identify it in tool calls.""" + + type: Literal["mcp"] + """The type of the MCP tool. Always `mcp`.""" + + allowed_tools: Optional[ToolMcpToolAllowedTools] = None + """List of allowed tool names or a filter object.""" + + authorization: Optional[str] = None """ - The arbitrary metadata to attach to this trace to enable filtering in the traces - dashboard. + An OAuth access token that can be used with a remote MCP server, either with a + custom MCP server URL or a service connector. Your application must handle the + OAuth authorization flow and provide the token here. """ - workflow_name: Optional[str] = None - """The name of the workflow to attach to this trace. - - This is used to name the trace in the traces dashboard. + connector_id: Optional[ + Literal[ + "connector_dropbox", + "connector_gmail", + "connector_googlecalendar", + "connector_googledrive", + "connector_microsoftteams", + "connector_outlookcalendar", + "connector_outlookemail", + "connector_sharepoint", + ] + ] = None + """Identifier for service connectors, like those available in ChatGPT. + + One of `server_url` or `connector_id` must be provided. Learn more about service + connectors + [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors). + + Currently supported `connector_id` values are: + + - Dropbox: `connector_dropbox` + - Gmail: `connector_gmail` + - Google Calendar: `connector_googlecalendar` + - Google Drive: `connector_googledrive` + - Microsoft Teams: `connector_microsoftteams` + - Outlook Calendar: `connector_outlookcalendar` + - Outlook Email: `connector_outlookemail` + - SharePoint: `connector_sharepoint` """ + headers: Optional[Dict[str, str]] = None + """Optional HTTP headers to send to the MCP server. 
+ + Use for authentication or other purposes. + """ -Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration] + require_approval: Optional[ToolMcpToolRequireApproval] = None + """Specify which of the MCP server's tools require approval.""" + server_description: Optional[str] = None + """Optional description of the MCP server, used to provide more context.""" -class TurnDetection(BaseModel): - prefix_padding_ms: Optional[int] = None - """Amount of audio to include before the VAD detected speech (in milliseconds). + server_url: Optional[str] = None + """The URL for the MCP server. - Defaults to 300ms. + One of `server_url` or `connector_id` must be provided. """ - silence_duration_ms: Optional[int] = None - """Duration of silence to detect speech stop (in milliseconds). - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. +Tool: TypeAlias = Union[Models, ToolMcpTool] + + +class TracingTracingConfiguration(BaseModel): + group_id: Optional[str] = None + """ + The group id to attach to this trace to enable filtering and grouping in the + Traces Dashboard. """ - threshold: Optional[float] = None - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + metadata: Optional[object] = None + """ + The arbitrary metadata to attach to this trace to enable filtering in the Traces + Dashboard. + """ - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. + workflow_name: Optional[str] = None + """The name of the workflow to attach to this trace. + + This is used to name the trace in the Traces Dashboard. """ - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" +Tracing: TypeAlias = Union[Literal["auto"], TracingTracingConfiguration, None] -class RealtimeSessionCreateResponse(BaseModel): - id: Optional[str] = None - """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" +class RealtimeSessionCreateResponse(BaseModel): audio: Optional[Audio] = None - """Configuration for input and output audio for the session.""" + """Configuration for input and output audio.""" - expires_at: Optional[int] = None - """Expiration timestamp for the session, in seconds since epoch.""" + client_secret: Optional[RealtimeSessionClientSecret] = None + """Ephemeral key returned by the API.""" include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None """Additional fields to include in server outputs. - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. + `item.input_audio_transcription.logprobs`: Include logprobs for input audio + transcription. """ instructions: Optional[str] = None @@ -182,41 +361,60 @@ class RealtimeSessionCreateResponse(BaseModel): `inf` for the maximum available tokens for a given model. Defaults to `inf`. """ - model: Optional[str] = None + model: Union[ + str, + Literal[ + "gpt-realtime", + "gpt-realtime-2025-08-28", + "gpt-4o-realtime-preview", + "gpt-4o-realtime-preview-2024-10-01", + "gpt-4o-realtime-preview-2024-12-17", + "gpt-4o-realtime-preview-2025-06-03", + "gpt-4o-mini-realtime-preview", + "gpt-4o-mini-realtime-preview-2024-12-17", + ], + None, + ] = None """The Realtime model used for this session.""" - object: Optional[str] = None - """The object type. 
Always `realtime.session`.""" - output_modalities: Optional[List[Literal["text", "audio"]]] = None """The set of modalities the model can respond with. - To disable audio, set this to ["text"]. + It defaults to `["audio"]`, indicating that the model will respond with audio + plus a transcript. `["text"]` can be used to make the model respond with text + only. It is not possible to request both `text` and `audio` at the same time. + """ + + prompt: Optional[ResponsePrompt] = None + """Reference to a prompt template and its variables. + + [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). """ - tool_choice: Optional[str] = None + tool_choice: Optional[ToolChoice] = None """How the model chooses tools. - Options are `auto`, `none`, `required`, or specify a function. + Provide one of the string modes or force a specific function/MCP tool. """ tools: Optional[List[Tool]] = None - """Tools (functions) available to the model.""" + """Tools available to the model.""" tracing: Optional[Tracing] = None - """Configuration options for tracing. - - Set to null to disable tracing. Once tracing is enabled for a session, the - configuration cannot be modified. + """ + Realtime API can write session traces to the + [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once + tracing is enabled for a session, the configuration cannot be modified. `auto` will create a trace for the session with default values for the workflow name, group id, and metadata. """ - turn_detection: Optional[TurnDetection] = None - """Configuration for turn detection. - - Can be set to `null` to turn off. Server VAD means that the model will detect - the start and end of speech based on audio volume and respond at the end of user - speech. + truncation: Optional[RealtimeTruncation] = None """ + Controls how the realtime conversation is truncated prior to model inference. + The default is `auto`. + """ + + type: Optional[Literal["realtime"]] = None + """The type of session to create. Always `realtime` for the Realtime API.""" diff --git a/src/openai/types/realtime/realtime_tools_config_param.py b/src/openai/types/realtime/realtime_tools_config_param.py index ea4b8c4d43..700b548fe2 100644 --- a/src/openai/types/realtime/realtime_tools_config_param.py +++ b/src/openai/types/realtime/realtime_tools_config_param.py @@ -6,11 +6,11 @@ from typing_extensions import Literal, Required, TypeAlias, TypedDict from ..._types import SequenceNotStr +from .models_param import ModelsParam __all__ = [ "RealtimeToolsConfigParam", "RealtimeToolsConfigUnionParam", - "Function", "Mcp", "McpAllowedTools", "McpAllowedToolsMcpToolFilter", @@ -21,23 +21,6 @@ ] -class Function(TypedDict, total=False): - description: str - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: str - """The name of the function.""" - - parameters: object - """Parameters of the function in JSON Schema.""" - - type: Literal["function"] - """The type of the tool, i.e. `function`.""" - - class McpAllowedToolsMcpToolFilter(TypedDict, total=False): read_only: bool """Indicates whether or not a tool modifies data or is read-only. 
@@ -155,6 +138,6 @@ class Mcp(TypedDict, total=False): """ -RealtimeToolsConfigUnionParam: TypeAlias = Union[Function, Mcp] +RealtimeToolsConfigUnionParam: TypeAlias = Union[ModelsParam, Mcp] RealtimeToolsConfigParam: TypeAlias = List[RealtimeToolsConfigUnionParam] diff --git a/src/openai/types/realtime/realtime_tools_config_union.py b/src/openai/types/realtime/realtime_tools_config_union.py index 16b1557743..8a064d78d4 100644 --- a/src/openai/types/realtime/realtime_tools_config_union.py +++ b/src/openai/types/realtime/realtime_tools_config_union.py @@ -3,12 +3,12 @@ from typing import Dict, List, Union, Optional from typing_extensions import Literal, Annotated, TypeAlias +from .models import Models from ..._utils import PropertyInfo from ..._models import BaseModel __all__ = [ "RealtimeToolsConfigUnion", - "Function", "Mcp", "McpAllowedTools", "McpAllowedToolsMcpToolFilter", @@ -19,23 +19,6 @@ ] -class Function(BaseModel): - description: Optional[str] = None - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: Optional[str] = None - """The name of the function.""" - - parameters: Optional[object] = None - """Parameters of the function in JSON Schema.""" - - type: Optional[Literal["function"]] = None - """The type of the tool, i.e. `function`.""" - - class McpAllowedToolsMcpToolFilter(BaseModel): read_only: Optional[bool] = None """Indicates whether or not a tool modifies data or is read-only. @@ -155,4 +138,4 @@ class Mcp(BaseModel): """ -RealtimeToolsConfigUnion: TypeAlias = Annotated[Union[Function, Mcp], PropertyInfo(discriminator="type")] +RealtimeToolsConfigUnion: TypeAlias = Annotated[Union[Models, Mcp], PropertyInfo(discriminator="type")] diff --git a/src/openai/types/realtime/realtime_tools_config_union_param.py b/src/openai/types/realtime/realtime_tools_config_union_param.py index 21b4d07752..179ad040d9 100644 --- a/src/openai/types/realtime/realtime_tools_config_union_param.py +++ b/src/openai/types/realtime/realtime_tools_config_union_param.py @@ -6,10 +6,10 @@ from typing_extensions import Literal, Required, TypeAlias, TypedDict from ..._types import SequenceNotStr +from .models_param import ModelsParam __all__ = [ "RealtimeToolsConfigUnionParam", - "Function", "Mcp", "McpAllowedTools", "McpAllowedToolsMcpToolFilter", @@ -20,23 +20,6 @@ ] -class Function(TypedDict, total=False): - description: str - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: str - """The name of the function.""" - - parameters: object - """Parameters of the function in JSON Schema.""" - - type: Literal["function"] - """The type of the tool, i.e. `function`.""" - - class McpAllowedToolsMcpToolFilter(TypedDict, total=False): read_only: bool """Indicates whether or not a tool modifies data or is read-only. 
@@ -154,4 +137,4 @@ class Mcp(TypedDict, total=False): """ -RealtimeToolsConfigUnionParam: TypeAlias = Union[Function, Mcp] +RealtimeToolsConfigUnionParam: TypeAlias = Union[ModelsParam, Mcp] diff --git a/src/openai/types/realtime/realtime_tracing_config.py b/src/openai/types/realtime/realtime_tracing_config.py index 1de24d6e5f..1c46de7928 100644 --- a/src/openai/types/realtime/realtime_tracing_config.py +++ b/src/openai/types/realtime/realtime_tracing_config.py @@ -12,19 +12,19 @@ class TracingConfiguration(BaseModel): group_id: Optional[str] = None """ The group id to attach to this trace to enable filtering and grouping in the - traces dashboard. + Traces Dashboard. """ metadata: Optional[object] = None """ - The arbitrary metadata to attach to this trace to enable filtering in the traces - dashboard. + The arbitrary metadata to attach to this trace to enable filtering in the Traces + Dashboard. """ workflow_name: Optional[str] = None """The name of the workflow to attach to this trace. - This is used to name the trace in the traces dashboard. + This is used to name the trace in the Traces Dashboard. """ diff --git a/src/openai/types/realtime/realtime_tracing_config_param.py b/src/openai/types/realtime/realtime_tracing_config_param.py index 3a35c6f7fa..fd9e266244 100644 --- a/src/openai/types/realtime/realtime_tracing_config_param.py +++ b/src/openai/types/realtime/realtime_tracing_config_param.py @@ -12,19 +12,19 @@ class TracingConfiguration(TypedDict, total=False): group_id: str """ The group id to attach to this trace to enable filtering and grouping in the - traces dashboard. + Traces Dashboard. """ metadata: object """ - The arbitrary metadata to attach to this trace to enable filtering in the traces - dashboard. + The arbitrary metadata to attach to this trace to enable filtering in the Traces + Dashboard. """ workflow_name: str """The name of the workflow to attach to this trace. - This is used to name the trace in the traces dashboard. + This is used to name the trace in the Traces Dashboard. """ diff --git a/src/openai/types/realtime/realtime_transcription_session_audio.py b/src/openai/types/realtime/realtime_transcription_session_audio.py new file mode 100644 index 0000000000..a5506947f1 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio.py @@ -0,0 +1,12 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel +from .realtime_transcription_session_audio_input import RealtimeTranscriptionSessionAudioInput + +__all__ = ["RealtimeTranscriptionSessionAudio"] + + +class RealtimeTranscriptionSessionAudio(BaseModel): + input: Optional[RealtimeTranscriptionSessionAudioInput] = None diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input.py b/src/openai/types/realtime/realtime_transcription_session_audio_input.py new file mode 100644 index 0000000000..0ae92959aa --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input.py @@ -0,0 +1,62 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import Optional + +from ..._models import BaseModel +from .audio_transcription import AudioTranscription +from .noise_reduction_type import NoiseReductionType +from .realtime_audio_formats import RealtimeAudioFormats +from .realtime_transcription_session_audio_input_turn_detection import ( + RealtimeTranscriptionSessionAudioInputTurnDetection, +) + +__all__ = ["RealtimeTranscriptionSessionAudioInput", "NoiseReduction"] + + +class NoiseReduction(BaseModel): + type: Optional[NoiseReductionType] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class RealtimeTranscriptionSessionAudioInput(BaseModel): + format: Optional[RealtimeAudioFormats] = None + """The PCM audio format. Only a 24kHz sample rate is supported.""" + + noise_reduction: Optional[NoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + transcription: Optional[AudioTranscription] = None + """ + Configuration for input audio transcription, defaults to off and can be set to + `null` to turn off once on. Input audio transcription is not native to the + model, since the model consumes audio directly. Transcription runs + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. + """ + + turn_detection: Optional[RealtimeTranscriptionSessionAudioInputTurnDetection] = None + """Configuration for turn detection, either Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. + """ diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py new file mode 100644 index 0000000000..a8263789dc --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py @@ -0,0 +1,63 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+ +from __future__ import annotations + +from typing_extensions import TypedDict + +from .noise_reduction_type import NoiseReductionType +from .audio_transcription_param import AudioTranscriptionParam +from .realtime_audio_formats_param import RealtimeAudioFormatsParam +from .realtime_transcription_session_audio_input_turn_detection_param import ( + RealtimeTranscriptionSessionAudioInputTurnDetectionParam, +) + +__all__ = ["RealtimeTranscriptionSessionAudioInputParam", "NoiseReduction"] + + +class NoiseReduction(TypedDict, total=False): + type: NoiseReductionType + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class RealtimeTranscriptionSessionAudioInputParam(TypedDict, total=False): + format: RealtimeAudioFormatsParam + """The PCM audio format. Only a 24kHz sample rate is supported.""" + + noise_reduction: NoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + transcription: AudioTranscriptionParam + """ + Configuration for input audio transcription, defaults to off and can be set to + `null` to turn off once on. Input audio transcription is not native to the + model, since the model consumes audio directly. Transcription runs + asynchronously through + [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) + and should be treated as guidance of input audio content rather than precisely + what the model heard. The client can optionally set the language and prompt for + transcription, these offer additional guidance to the transcription service. + """ + + turn_detection: RealtimeTranscriptionSessionAudioInputTurnDetectionParam + """Configuration for turn detection, either Server VAD or Semantic VAD. + + This can be set to `null` to turn off, in which case the client must manually + trigger model response. Server VAD means that the model will detect the start + and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction + with VAD) to semantically estimate whether the user has finished speaking, then + dynamically sets a timeout based on this probability. For example, if user audio + trails off with "uhhm", the model will score a low probability of turn end and + wait longer for the user to continue speaking. This can be useful for more + natural conversations, but may have a higher latency. + """ diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py new file mode 100644 index 0000000000..0cac36f7a3 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py @@ -0,0 +1,63 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+ +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetection"] + + +class RealtimeTranscriptionSessionAudioInputTurnDetection(BaseModel): + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. + """ + + idle_timeout_ms: Optional[int] = None + """ + Optional idle timeout after which turn detection will auto-timeout when no + additional audio is received. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: Optional[float] = None + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Optional[Literal["server_vad", "semantic_vad"]] = None + """Type of turn detection.""" diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py new file mode 100644 index 0000000000..e76dc9a8fe --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py @@ -0,0 +1,63 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Optional +from typing_extensions import Literal, TypedDict + +__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetectionParam"] + + +class RealtimeTranscriptionSessionAudioInputTurnDetectionParam(TypedDict, total=False): + create_response: bool + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. + """ + + idle_timeout_ms: Optional[int] + """ + Optional idle timeout after which turn detection will auto-timeout when no + additional audio is received. + """ + + interrupt_response: bool + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + prefix_padding_ms: int + """Used only for `server_vad` mode. 
+ + Amount of audio to include before the VAD detected speech (in milliseconds). + Defaults to 300ms. + """ + + silence_duration_ms: int + """Used only for `server_vad` mode. + + Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + With shorter values the model will respond more quickly, but may jump in on + short pauses from the user. + """ + + threshold: float + """Used only for `server_vad` mode. + + Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + threshold will require louder audio to activate the model, and thus might + perform better in noisy environments. + """ + + type: Literal["server_vad", "semantic_vad"] + """Type of turn detection.""" diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_param.py b/src/openai/types/realtime/realtime_transcription_session_audio_param.py new file mode 100644 index 0000000000..1503a606d3 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_audio_param.py @@ -0,0 +1,13 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import TypedDict + +from .realtime_transcription_session_audio_input_param import RealtimeTranscriptionSessionAudioInputParam + +__all__ = ["RealtimeTranscriptionSessionAudioParam"] + + +class RealtimeTranscriptionSessionAudioParam(TypedDict, total=False): + input: RealtimeTranscriptionSessionAudioInputParam diff --git a/src/openai/types/realtime/realtime_transcription_session_client_secret.py b/src/openai/types/realtime/realtime_transcription_session_client_secret.py new file mode 100644 index 0000000000..0cfde4c0a2 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_client_secret.py @@ -0,0 +1,20 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from ..._models import BaseModel + +__all__ = ["RealtimeTranscriptionSessionClientSecret"] + + +class RealtimeTranscriptionSessionClientSecret(BaseModel): + expires_at: int + """Timestamp for when the token expires. + + Currently, all tokens expire after one minute. + """ + + value: str + """ + Ephemeral key usable in client environments to authenticate connections to the + Realtime API. Use this in client-side environments rather than a standard API + token, which should only be used server-side. + """ diff --git a/src/openai/types/realtime/realtime_transcription_session_create_request.py b/src/openai/types/realtime/realtime_transcription_session_create_request.py index d67bc92708..102f2b14fb 100644 --- a/src/openai/types/realtime/realtime_transcription_session_create_request.py +++ b/src/openai/types/realtime/realtime_transcription_session_create_request.py @@ -1,128 +1,27 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional +from typing import List, Optional from typing_extensions import Literal from ..._models import BaseModel +from .realtime_transcription_session_audio import RealtimeTranscriptionSessionAudio -__all__ = [ - "RealtimeTranscriptionSessionCreateRequest", - "InputAudioNoiseReduction", - "InputAudioTranscription", - "TurnDetection", -] - - -class InputAudioNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - """Type of noise reduction. - - `near_field` is for close-talking microphones such as headphones, `far_field` is - for far-field microphones such as laptop or conference room microphones. 
- """ - - -class InputAudioTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None - """ - The model to use for transcription, current options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1`. - """ - - prompt: Optional[str] = None - """ - An optional text to guide the model's style or continue a previous audio - segment. For `whisper-1`, the - [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". - """ - - -class TurnDetection(BaseModel): - prefix_padding_ms: Optional[int] = None - """Amount of audio to include before the VAD detected speech (in milliseconds). - - Defaults to 300ms. - """ - - silence_duration_ms: Optional[int] = None - """Duration of silence to detect speech stop (in milliseconds). - - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. - """ - - threshold: Optional[float] = None - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. - - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. - """ - - type: Optional[Literal["server_vad"]] = None - """Type of turn detection. - - Only `server_vad` is currently supported for transcription sessions. - """ +__all__ = ["RealtimeTranscriptionSessionCreateRequest"] class RealtimeTranscriptionSessionCreateRequest(BaseModel): - model: Union[str, Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]] - """ID of the model to use. - - The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` - (which is powered by our open source Whisper V2 model). - """ - type: Literal["transcription"] """The type of session to create. Always `transcription` for transcription sessions. """ - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None - """The set of items to include in the transcription. Current available items are: - - - `item.input_audio_transcription.logprobs` - """ + audio: Optional[RealtimeTranscriptionSessionAudio] = None + """Configuration for input and output audio.""" - input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of input audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must - be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian - byte order. - """ - - input_audio_noise_reduction: Optional[InputAudioNoiseReduction] = None - """Configuration for input audio noise reduction. - - This can be set to `null` to turn off. Noise reduction filters audio added to - the input audio buffer before it is sent to VAD and the model. Filtering the - audio can improve VAD and turn detection accuracy (reducing false positives) and - model performance by improving perception of the input audio. - """ - - input_audio_transcription: Optional[InputAudioTranscription] = None - """Configuration for input audio transcription. 
- - The client can optionally set the language and prompt for transcription, these - offer additional guidance to the transcription service. - """ - - turn_detection: Optional[TurnDetection] = None - """Configuration for turn detection. + include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None + """Additional fields to include in server outputs. - Can be set to `null` to turn off. Server VAD means that the model will detect - the start and end of speech based on audio volume and respond at the end of user - speech. + `item.input_audio_transcription.logprobs`: Include logprobs for input audio + transcription. """ diff --git a/src/openai/types/realtime/realtime_transcription_session_create_request_param.py b/src/openai/types/realtime/realtime_transcription_session_create_request_param.py index 405f0c5f2c..80cbe2d414 100644 --- a/src/openai/types/realtime/realtime_transcription_session_create_request_param.py +++ b/src/openai/types/realtime/realtime_transcription_session_create_request_param.py @@ -2,127 +2,27 @@ from __future__ import annotations -from typing import List, Union +from typing import List from typing_extensions import Literal, Required, TypedDict -__all__ = [ - "RealtimeTranscriptionSessionCreateRequestParam", - "InputAudioNoiseReduction", - "InputAudioTranscription", - "TurnDetection", -] +from .realtime_transcription_session_audio_param import RealtimeTranscriptionSessionAudioParam - -class InputAudioNoiseReduction(TypedDict, total=False): - type: Literal["near_field", "far_field"] - """Type of noise reduction. - - `near_field` is for close-talking microphones such as headphones, `far_field` is - for far-field microphones such as laptop or conference room microphones. - """ - - -class InputAudioTranscription(TypedDict, total=False): - language: str - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"] - """ - The model to use for transcription, current options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1`. - """ - - prompt: str - """ - An optional text to guide the model's style or continue a previous audio - segment. For `whisper-1`, the - [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". - """ - - -class TurnDetection(TypedDict, total=False): - prefix_padding_ms: int - """Amount of audio to include before the VAD detected speech (in milliseconds). - - Defaults to 300ms. - """ - - silence_duration_ms: int - """Duration of silence to detect speech stop (in milliseconds). - - Defaults to 500ms. With shorter values the model will respond more quickly, but - may jump in on short pauses from the user. - """ - - threshold: float - """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. - - A higher threshold will require louder audio to activate the model, and thus - might perform better in noisy environments. - """ - - type: Literal["server_vad"] - """Type of turn detection. - - Only `server_vad` is currently supported for transcription sessions. 
- """ +__all__ = ["RealtimeTranscriptionSessionCreateRequestParam"] class RealtimeTranscriptionSessionCreateRequestParam(TypedDict, total=False): - model: Required[Union[str, Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"]]] - """ID of the model to use. - - The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` - (which is powered by our open source Whisper V2 model). - """ - type: Required[Literal["transcription"]] """The type of session to create. Always `transcription` for transcription sessions. """ - include: List[Literal["item.input_audio_transcription.logprobs"]] - """The set of items to include in the transcription. Current available items are: - - - `item.input_audio_transcription.logprobs` - """ + audio: RealtimeTranscriptionSessionAudioParam + """Configuration for input and output audio.""" - input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] - """The format of input audio. - - Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must - be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian - byte order. - """ - - input_audio_noise_reduction: InputAudioNoiseReduction - """Configuration for input audio noise reduction. - - This can be set to `null` to turn off. Noise reduction filters audio added to - the input audio buffer before it is sent to VAD and the model. Filtering the - audio can improve VAD and turn detection accuracy (reducing false positives) and - model performance by improving perception of the input audio. - """ - - input_audio_transcription: InputAudioTranscription - """Configuration for input audio transcription. - - The client can optionally set the language and prompt for transcription, these - offer additional guidance to the transcription service. - """ - - turn_detection: TurnDetection - """Configuration for turn detection. + include: List[Literal["item.input_audio_transcription.logprobs"]] + """Additional fields to include in server outputs. - Can be set to `null` to turn off. Server VAD means that the model will detect - the start and end of speech based on audio volume and respond at the end of user - speech. + `item.input_audio_transcription.logprobs`: Include logprobs for input audio + transcription. """ diff --git a/src/openai/types/realtime/realtime_transcription_session_create_response.py b/src/openai/types/realtime/realtime_transcription_session_create_response.py new file mode 100644 index 0000000000..a08538aa8f --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_create_response.py @@ -0,0 +1,41 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List, Optional +from typing_extensions import Literal + +from ..._models import BaseModel +from .realtime_transcription_session_client_secret import RealtimeTranscriptionSessionClientSecret +from .realtime_transcription_session_turn_detection import RealtimeTranscriptionSessionTurnDetection +from .realtime_transcription_session_input_audio_transcription import ( + RealtimeTranscriptionSessionInputAudioTranscription, +) + +__all__ = ["RealtimeTranscriptionSessionCreateResponse"] + + +class RealtimeTranscriptionSessionCreateResponse(BaseModel): + client_secret: RealtimeTranscriptionSessionClientSecret + """Ephemeral key returned by the API. + + Only present when the session is created on the server via REST API. + """ + + input_audio_format: Optional[str] = None + """The format of input audio. 
Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" + + input_audio_transcription: Optional[RealtimeTranscriptionSessionInputAudioTranscription] = None + """Configuration of the transcription model.""" + + modalities: Optional[List[Literal["text", "audio"]]] = None + """The set of modalities the model can respond with. + + To disable audio, set this to ["text"]. + """ + + turn_detection: Optional[RealtimeTranscriptionSessionTurnDetection] = None + """Configuration for turn detection. + + Can be set to `null` to turn off. Server VAD means that the model will detect + the start and end of speech based on audio volume and respond at the end of user + speech. + """ diff --git a/src/openai/types/realtime/realtime_transcription_session_input_audio_transcription.py b/src/openai/types/realtime/realtime_transcription_session_input_audio_transcription.py new file mode 100644 index 0000000000..52254bed33 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_input_audio_transcription.py @@ -0,0 +1,36 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["RealtimeTranscriptionSessionInputAudioTranscription"] + + +class RealtimeTranscriptionSessionInputAudioTranscription(BaseModel): + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + """ + + model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = ( + None + ) + """The model to use for transcription. + + Current options are `whisper-1`, `gpt-4o-transcribe-latest`, + `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + """ + + prompt: Optional[str] = None + """ + An optional text to guide the model's style or continue a previous audio + segment. For `whisper-1`, the + [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example + "expect words related to technology". + """ diff --git a/src/openai/types/realtime/realtime_transcription_session_turn_detection.py b/src/openai/types/realtime/realtime_transcription_session_turn_detection.py new file mode 100644 index 0000000000..f5da31ce77 --- /dev/null +++ b/src/openai/types/realtime/realtime_transcription_session_turn_detection.py @@ -0,0 +1,32 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Optional + +from ..._models import BaseModel + +__all__ = ["RealtimeTranscriptionSessionTurnDetection"] + + +class RealtimeTranscriptionSessionTurnDetection(BaseModel): + prefix_padding_ms: Optional[int] = None + """Amount of audio to include before the VAD detected speech (in milliseconds). + + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Duration of silence to detect speech stop (in milliseconds). + + Defaults to 500ms. With shorter values the model will respond more quickly, but + may jump in on short pauses from the user. + """ + + threshold: Optional[float] = None + """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + + A higher threshold will require louder audio to activate the model, and thus + might perform better in noisy environments. 
+ """ + + type: Optional[str] = None + """Type of turn detection, only `server_vad` is currently supported.""" diff --git a/src/openai/types/realtime/realtime_truncation.py b/src/openai/types/realtime/realtime_truncation.py index 4687e3da56..515f869071 100644 --- a/src/openai/types/realtime/realtime_truncation.py +++ b/src/openai/types/realtime/realtime_truncation.py @@ -1,22 +1,10 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Union, Optional +from typing import Union from typing_extensions import Literal, TypeAlias -from ..._models import BaseModel +from .realtime_truncation_retention_ratio import RealtimeTruncationRetentionRatio -__all__ = ["RealtimeTruncation", "RetentionRatioTruncation"] +__all__ = ["RealtimeTruncation"] - -class RetentionRatioTruncation(BaseModel): - retention_ratio: float - """Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).""" - - type: Literal["retention_ratio"] - """Use retention ratio truncation.""" - - post_instructions_token_limit: Optional[int] = None - """Optional cap on tokens allowed after the instructions.""" - - -RealtimeTruncation: TypeAlias = Union[Literal["auto", "disabled"], RetentionRatioTruncation] +RealtimeTruncation: TypeAlias = Union[Literal["auto", "disabled"], RealtimeTruncationRetentionRatio] diff --git a/src/openai/types/realtime/realtime_truncation_param.py b/src/openai/types/realtime/realtime_truncation_param.py index edc88ea685..5e42b27418 100644 --- a/src/openai/types/realtime/realtime_truncation_param.py +++ b/src/openai/types/realtime/realtime_truncation_param.py @@ -2,21 +2,11 @@ from __future__ import annotations -from typing import Union, Optional -from typing_extensions import Literal, Required, TypeAlias, TypedDict +from typing import Union +from typing_extensions import Literal, TypeAlias -__all__ = ["RealtimeTruncationParam", "RetentionRatioTruncation"] +from .realtime_truncation_retention_ratio_param import RealtimeTruncationRetentionRatioParam +__all__ = ["RealtimeTruncationParam"] -class RetentionRatioTruncation(TypedDict, total=False): - retention_ratio: Required[float] - """Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).""" - - type: Required[Literal["retention_ratio"]] - """Use retention ratio truncation.""" - - post_instructions_token_limit: Optional[int] - """Optional cap on tokens allowed after the instructions.""" - - -RealtimeTruncationParam: TypeAlias = Union[Literal["auto", "disabled"], RetentionRatioTruncation] +RealtimeTruncationParam: TypeAlias = Union[Literal["auto", "disabled"], RealtimeTruncationRetentionRatioParam] diff --git a/src/openai/types/realtime/realtime_truncation_retention_ratio.py b/src/openai/types/realtime/realtime_truncation_retention_ratio.py new file mode 100644 index 0000000000..b40427244e --- /dev/null +++ b/src/openai/types/realtime/realtime_truncation_retention_ratio.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["RealtimeTruncationRetentionRatio"] + + +class RealtimeTruncationRetentionRatio(BaseModel): + retention_ratio: float + """ + Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the + conversation exceeds the input token limit. 
+ """ + + type: Literal["retention_ratio"] + """Use retention ratio truncation.""" diff --git a/src/openai/types/realtime/realtime_truncation_retention_ratio_param.py b/src/openai/types/realtime/realtime_truncation_retention_ratio_param.py new file mode 100644 index 0000000000..b65d65666a --- /dev/null +++ b/src/openai/types/realtime/realtime_truncation_retention_ratio_param.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Literal, Required, TypedDict + +__all__ = ["RealtimeTruncationRetentionRatioParam"] + + +class RealtimeTruncationRetentionRatioParam(TypedDict, total=False): + retention_ratio: Required[float] + """ + Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the + conversation exceeds the input token limit. + """ + + type: Required[Literal["retention_ratio"]] + """Use retention ratio truncation.""" diff --git a/src/openai/types/realtime/response_create_event.py b/src/openai/types/realtime/response_create_event.py index a37045eab1..75a08ee460 100644 --- a/src/openai/types/realtime/response_create_event.py +++ b/src/openai/types/realtime/response_create_event.py @@ -1,126 +1,12 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Union, Optional -from typing_extensions import Literal, TypeAlias +from typing import Optional +from typing_extensions import Literal from ..._models import BaseModel -from ..shared.metadata import Metadata -from .conversation_item import ConversationItem -from ..responses.response_prompt import ResponsePrompt -from ..responses.tool_choice_mcp import ToolChoiceMcp -from ..responses.tool_choice_options import ToolChoiceOptions -from ..responses.tool_choice_function import ToolChoiceFunction +from .realtime_response_create_params import RealtimeResponseCreateParams -__all__ = ["ResponseCreateEvent", "Response", "ResponseToolChoice", "ResponseTool"] - -ResponseToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunction, ToolChoiceMcp] - - -class ResponseTool(BaseModel): - description: Optional[str] = None - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: Optional[str] = None - """The name of the function.""" - - parameters: Optional[object] = None - """Parameters of the function in JSON Schema.""" - - type: Optional[Literal["function"]] = None - """The type of the tool, i.e. `function`.""" - - -class Response(BaseModel): - conversation: Union[str, Literal["auto", "none"], None] = None - """Controls which conversation the response is added to. - - Currently supports `auto` and `none`, with `auto` as the default value. The - `auto` value means that the contents of the response will be added to the - default conversation. Set this to `none` to create an out-of-band response which - will not add items to default conversation. - """ - - input: Optional[List[ConversationItem]] = None - """Input items to include in the prompt for the model. - - Using this field creates a new context for this Response instead of using the - default conversation. An empty array `[]` will clear the context for this - Response. Note that this can include references to items from the default - conversation. - """ - - instructions: Optional[str] = None - """The default system instructions (i.e. - - system message) prepended to model calls. 
This field allows the client to guide - the model on desired responses. The model can be instructed on response content - and format, (e.g. "be extremely succinct", "act friendly", "here are examples of - good responses") and on audio behavior (e.g. "talk quickly", "inject emotion - into your voice", "laugh frequently"). The instructions are not guaranteed to be - followed by the model, but they provide guidance to the model on the desired - behavior. - - Note that the server sets default instructions which will be used if this field - is not set and are visible in the `session.created` event at the start of the - session. - """ - - max_output_tokens: Union[int, Literal["inf"], None] = None - """ - Maximum number of output tokens for a single assistant response, inclusive of - tool calls. Provide an integer between 1 and 4096 to limit output tokens, or - `inf` for the maximum available tokens for a given model. Defaults to `inf`. - """ - - metadata: Optional[Metadata] = None - """Set of 16 key-value pairs that can be attached to an object. - - This can be useful for storing additional information about the object in a - structured format, and querying for objects via API or the dashboard. - - Keys are strings with a maximum length of 64 characters. Values are strings with - a maximum length of 512 characters. - """ - - modalities: Optional[List[Literal["text", "audio"]]] = None - """The set of modalities the model can respond with. - - To disable audio, set this to ["text"]. - """ - - output_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None - """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" - - prompt: Optional[ResponsePrompt] = None - """Reference to a prompt template and its variables. - - [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). - """ - - temperature: Optional[float] = None - """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.""" - - tool_choice: Optional[ResponseToolChoice] = None - """How the model chooses tools. - - Provide one of the string modes or force a specific function/MCP tool. - """ - - tools: Optional[List[ResponseTool]] = None - """Tools (functions) available to the model.""" - - voice: Union[ - str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None - ] = None - """The voice the model uses to respond. - - Voice cannot be changed during the session once the model has responded with - audio at least once. Current voice options are `alloy`, `ash`, `ballad`, - `coral`, `echo`, `sage`, `shimmer`, and `verse`. 
- """ +__all__ = ["ResponseCreateEvent"] class ResponseCreateEvent(BaseModel): @@ -130,5 +16,5 @@ class ResponseCreateEvent(BaseModel): event_id: Optional[str] = None """Optional client-generated ID used to identify this event.""" - response: Optional[Response] = None + response: Optional[RealtimeResponseCreateParams] = None """Create a new Realtime response with these parameters""" diff --git a/src/openai/types/realtime/response_create_event_param.py b/src/openai/types/realtime/response_create_event_param.py index f941c4ca9c..e5dd46d9b6 100644 --- a/src/openai/types/realtime/response_create_event_param.py +++ b/src/openai/types/realtime/response_create_event_param.py @@ -2,124 +2,11 @@ from __future__ import annotations -from typing import List, Union, Iterable, Optional -from typing_extensions import Literal, Required, TypeAlias, TypedDict +from typing_extensions import Literal, Required, TypedDict -from ..shared_params.metadata import Metadata -from .conversation_item_param import ConversationItemParam -from ..responses.tool_choice_options import ToolChoiceOptions -from ..responses.response_prompt_param import ResponsePromptParam -from ..responses.tool_choice_mcp_param import ToolChoiceMcpParam -from ..responses.tool_choice_function_param import ToolChoiceFunctionParam +from .realtime_response_create_params_param import RealtimeResponseCreateParamsParam -__all__ = ["ResponseCreateEventParam", "Response", "ResponseToolChoice", "ResponseTool"] - -ResponseToolChoice: TypeAlias = Union[ToolChoiceOptions, ToolChoiceFunctionParam, ToolChoiceMcpParam] - - -class ResponseTool(TypedDict, total=False): - description: str - """ - The description of the function, including guidance on when and how to call it, - and guidance about what to tell the user when calling (if anything). - """ - - name: str - """The name of the function.""" - - parameters: object - """Parameters of the function in JSON Schema.""" - - type: Literal["function"] - """The type of the tool, i.e. `function`.""" - - -class Response(TypedDict, total=False): - conversation: Union[str, Literal["auto", "none"]] - """Controls which conversation the response is added to. - - Currently supports `auto` and `none`, with `auto` as the default value. The - `auto` value means that the contents of the response will be added to the - default conversation. Set this to `none` to create an out-of-band response which - will not add items to default conversation. - """ - - input: Iterable[ConversationItemParam] - """Input items to include in the prompt for the model. - - Using this field creates a new context for this Response instead of using the - default conversation. An empty array `[]` will clear the context for this - Response. Note that this can include references to items from the default - conversation. - """ - - instructions: str - """The default system instructions (i.e. - - system message) prepended to model calls. This field allows the client to guide - the model on desired responses. The model can be instructed on response content - and format, (e.g. "be extremely succinct", "act friendly", "here are examples of - good responses") and on audio behavior (e.g. "talk quickly", "inject emotion - into your voice", "laugh frequently"). The instructions are not guaranteed to be - followed by the model, but they provide guidance to the model on the desired - behavior. - - Note that the server sets default instructions which will be used if this field - is not set and are visible in the `session.created` event at the start of the - session. 
- """ - - max_output_tokens: Union[int, Literal["inf"]] - """ - Maximum number of output tokens for a single assistant response, inclusive of - tool calls. Provide an integer between 1 and 4096 to limit output tokens, or - `inf` for the maximum available tokens for a given model. Defaults to `inf`. - """ - - metadata: Optional[Metadata] - """Set of 16 key-value pairs that can be attached to an object. - - This can be useful for storing additional information about the object in a - structured format, and querying for objects via API or the dashboard. - - Keys are strings with a maximum length of 64 characters. Values are strings with - a maximum length of 512 characters. - """ - - modalities: List[Literal["text", "audio"]] - """The set of modalities the model can respond with. - - To disable audio, set this to ["text"]. - """ - - output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] - """The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" - - prompt: Optional[ResponsePromptParam] - """Reference to a prompt template and its variables. - - [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). - """ - - temperature: float - """Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.""" - - tool_choice: ResponseToolChoice - """How the model chooses tools. - - Provide one of the string modes or force a specific function/MCP tool. - """ - - tools: Iterable[ResponseTool] - """Tools (functions) available to the model.""" - - voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]] - """The voice the model uses to respond. - - Voice cannot be changed during the session once the model has responded with - audio at least once. Current voice options are `alloy`, `ash`, `ballad`, - `coral`, `echo`, `sage`, `shimmer`, and `verse`. - """ +__all__ = ["ResponseCreateEventParam"] class ResponseCreateEventParam(TypedDict, total=False): @@ -129,5 +16,5 @@ class ResponseCreateEventParam(TypedDict, total=False): event_id: str """Optional client-generated ID used to identify this event.""" - response: Response + response: RealtimeResponseCreateParamsParam """Create a new Realtime response with these parameters""" diff --git a/src/openai/types/realtime/session_created_event.py b/src/openai/types/realtime/session_created_event.py index 51f75700f0..b5caad35d7 100644 --- a/src/openai/types/realtime/session_created_event.py +++ b/src/openai/types/realtime/session_created_event.py @@ -1,19 +1,23 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing_extensions import Literal +from typing import Union +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel -from .realtime_session import RealtimeSession +from .realtime_session_create_request import RealtimeSessionCreateRequest +from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest -__all__ = ["SessionCreatedEvent"] +__all__ = ["SessionCreatedEvent", "Session"] + +Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest] class SessionCreatedEvent(BaseModel): event_id: str """The unique ID of the server event.""" - session: RealtimeSession - """Realtime session object.""" + session: Session + """The session configuration.""" type: Literal["session.created"] """The event type, must be `session.created`.""" diff --git a/src/openai/types/realtime/session_update_event.py b/src/openai/types/realtime/session_update_event.py index 00a4377f96..2e226162c4 100644 --- a/src/openai/types/realtime/session_update_event.py +++ b/src/openai/types/realtime/session_update_event.py @@ -1,20 +1,31 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Optional -from typing_extensions import Literal +from typing import Union, Optional +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel from .realtime_session_create_request import RealtimeSessionCreateRequest +from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest -__all__ = ["SessionUpdateEvent"] +__all__ = ["SessionUpdateEvent", "Session"] + +Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest] class SessionUpdateEvent(BaseModel): - session: RealtimeSessionCreateRequest - """Realtime session object configuration.""" + session: Session + """Update the Realtime session. + + Choose either a realtime session or a transcription session. + """ type: Literal["session.update"] """The event type, must be `session.update`.""" event_id: Optional[str] = None - """Optional client-generated ID used to identify this event.""" + """Optional client-generated ID used to identify this event. + + This is an arbitrary string that a client may assign. It will be passed back if + there is an error with the event, but the corresponding `session.updated` event + will not include it. 
+ """ diff --git a/src/openai/types/realtime/session_update_event_param.py b/src/openai/types/realtime/session_update_event_param.py index 79ff05f729..5962361431 100644 --- a/src/openai/types/realtime/session_update_event_param.py +++ b/src/openai/types/realtime/session_update_event_param.py @@ -2,19 +2,31 @@ from __future__ import annotations -from typing_extensions import Literal, Required, TypedDict +from typing import Union +from typing_extensions import Literal, Required, TypeAlias, TypedDict from .realtime_session_create_request_param import RealtimeSessionCreateRequestParam +from .realtime_transcription_session_create_request_param import RealtimeTranscriptionSessionCreateRequestParam -__all__ = ["SessionUpdateEventParam"] +__all__ = ["SessionUpdateEventParam", "Session"] + +Session: TypeAlias = Union[RealtimeSessionCreateRequestParam, RealtimeTranscriptionSessionCreateRequestParam] class SessionUpdateEventParam(TypedDict, total=False): - session: Required[RealtimeSessionCreateRequestParam] - """Realtime session object configuration.""" + session: Required[Session] + """Update the Realtime session. + + Choose either a realtime session or a transcription session. + """ type: Required[Literal["session.update"]] """The event type, must be `session.update`.""" event_id: str - """Optional client-generated ID used to identify this event.""" + """Optional client-generated ID used to identify this event. + + This is an arbitrary string that a client may assign. It will be passed back if + there is an error with the event, but the corresponding `session.updated` event + will not include it. + """ diff --git a/src/openai/types/realtime/session_updated_event.py b/src/openai/types/realtime/session_updated_event.py index b8a5972f6e..eb7ee0332d 100644 --- a/src/openai/types/realtime/session_updated_event.py +++ b/src/openai/types/realtime/session_updated_event.py @@ -1,19 +1,23 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing_extensions import Literal +from typing import Union +from typing_extensions import Literal, TypeAlias from ..._models import BaseModel -from .realtime_session import RealtimeSession +from .realtime_session_create_request import RealtimeSessionCreateRequest +from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest -__all__ = ["SessionUpdatedEvent"] +__all__ = ["SessionUpdatedEvent", "Session"] + +Session: TypeAlias = Union[RealtimeSessionCreateRequest, RealtimeTranscriptionSessionCreateRequest] class SessionUpdatedEvent(BaseModel): event_id: str """The unique ID of the server event.""" - session: RealtimeSession - """Realtime session object.""" + session: Session + """The session configuration.""" type: Literal["session.updated"] """The event type, must be `session.updated`.""" diff --git a/src/openai/types/realtime/transcription_session_created.py b/src/openai/types/realtime/transcription_session_created.py index 1d34d152d7..c358c5e8b0 100644 --- a/src/openai/types/realtime/transcription_session_created.py +++ b/src/openai/types/realtime/transcription_session_created.py @@ -1,105 +1,24 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import List, Optional from typing_extensions import Literal from ..._models import BaseModel +from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse -__all__ = [ - "TranscriptionSessionCreated", - "Session", - "SessionAudio", - "SessionAudioInput", - "SessionAudioInputNoiseReduction", - "SessionAudioInputTranscription", - "SessionAudioInputTurnDetection", -] - - -class SessionAudioInputNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - - -class SessionAudioInputTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None - """The model to use for transcription. - - Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. - """ - - prompt: Optional[str] = None - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. - """ - - -class SessionAudioInputTurnDetection(BaseModel): - prefix_padding_ms: Optional[int] = None - - silence_duration_ms: Optional[int] = None - - threshold: Optional[float] = None - - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" - - -class SessionAudioInput(BaseModel): - format: Optional[str] = None - """The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" - - noise_reduction: Optional[SessionAudioInputNoiseReduction] = None - """Configuration for input audio noise reduction.""" - - transcription: Optional[SessionAudioInputTranscription] = None - """Configuration of the transcription model.""" - - turn_detection: Optional[SessionAudioInputTurnDetection] = None - """Configuration for turn detection.""" - - -class SessionAudio(BaseModel): - input: Optional[SessionAudioInput] = None - - -class Session(BaseModel): - id: Optional[str] = None - """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" - - audio: Optional[SessionAudio] = None - """Configuration for input audio for the session.""" - - expires_at: Optional[int] = None - """Expiration timestamp for the session, in seconds since epoch.""" - - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None - """Additional fields to include in server outputs. - - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. - """ - - object: Optional[str] = None - """The object type. Always `realtime.transcription_session`.""" +__all__ = ["TranscriptionSessionCreated"] class TranscriptionSessionCreated(BaseModel): event_id: str """The unique ID of the server event.""" - session: Session - """A Realtime transcription session configuration object.""" + session: RealtimeTranscriptionSessionCreateResponse + """A new Realtime transcription session configuration. + + When a session is created on the server via REST API, the session object also + contains an ephemeral key. Default TTL for keys is 10 minutes. This property is + not present when a session is updated via the WebSocket API. 
+ """ type: Literal["transcription_session.created"] """The event type, must be `transcription_session.created`.""" diff --git a/src/openai/types/realtime/transcription_session_update.py b/src/openai/types/realtime/transcription_session_update.py index c8f5b9eb4a..0faff9cb57 100644 --- a/src/openai/types/realtime/transcription_session_update.py +++ b/src/openai/types/realtime/transcription_session_update.py @@ -1,16 +1,94 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Optional +from typing import List, Optional from typing_extensions import Literal from ..._models import BaseModel -from .realtime_transcription_session_create_request import RealtimeTranscriptionSessionCreateRequest +from .audio_transcription import AudioTranscription +from .noise_reduction_type import NoiseReductionType -__all__ = ["TranscriptionSessionUpdate"] +__all__ = ["TranscriptionSessionUpdate", "Session", "SessionInputAudioNoiseReduction", "SessionTurnDetection"] + + +class SessionInputAudioNoiseReduction(BaseModel): + type: Optional[NoiseReductionType] = None + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class SessionTurnDetection(BaseModel): + prefix_padding_ms: Optional[int] = None + """Amount of audio to include before the VAD detected speech (in milliseconds). + + Defaults to 300ms. + """ + + silence_duration_ms: Optional[int] = None + """Duration of silence to detect speech stop (in milliseconds). + + Defaults to 500ms. With shorter values the model will respond more quickly, but + may jump in on short pauses from the user. + """ + + threshold: Optional[float] = None + """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + + A higher threshold will require louder audio to activate the model, and thus + might perform better in noisy environments. + """ + + type: Optional[Literal["server_vad"]] = None + """Type of turn detection. + + Only `server_vad` is currently supported for transcription sessions. + """ + + +class Session(BaseModel): + include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None + """The set of items to include in the transcription. + + Current available items are: `item.input_audio_transcription.logprobs` + """ + + input_audio_format: Optional[Literal["pcm16", "g711_ulaw", "g711_alaw"]] = None + """The format of input audio. + + Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must + be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian + byte order. + """ + + input_audio_noise_reduction: Optional[SessionInputAudioNoiseReduction] = None + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + input_audio_transcription: Optional[AudioTranscription] = None + """Configuration for input audio transcription. + + The client can optionally set the language and prompt for transcription, these + offer additional guidance to the transcription service. + """ + + turn_detection: Optional[SessionTurnDetection] = None + """Configuration for turn detection. + + Can be set to `null` to turn off. 
Server VAD means that the model will detect + the start and end of speech based on audio volume and respond at the end of user + speech. + """ class TranscriptionSessionUpdate(BaseModel): - session: RealtimeTranscriptionSessionCreateRequest + session: Session """Realtime transcription session object configuration.""" type: Literal["transcription_session.update"] diff --git a/src/openai/types/realtime/transcription_session_update_param.py b/src/openai/types/realtime/transcription_session_update_param.py index f2e66efaa0..55c67798b6 100644 --- a/src/openai/types/realtime/transcription_session_update_param.py +++ b/src/openai/types/realtime/transcription_session_update_param.py @@ -2,15 +2,94 @@ from __future__ import annotations +from typing import List from typing_extensions import Literal, Required, TypedDict -from .realtime_transcription_session_create_request_param import RealtimeTranscriptionSessionCreateRequestParam +from .noise_reduction_type import NoiseReductionType +from .audio_transcription_param import AudioTranscriptionParam -__all__ = ["TranscriptionSessionUpdateParam"] +__all__ = ["TranscriptionSessionUpdateParam", "Session", "SessionInputAudioNoiseReduction", "SessionTurnDetection"] + + +class SessionInputAudioNoiseReduction(TypedDict, total=False): + type: NoiseReductionType + """Type of noise reduction. + + `near_field` is for close-talking microphones such as headphones, `far_field` is + for far-field microphones such as laptop or conference room microphones. + """ + + +class SessionTurnDetection(TypedDict, total=False): + prefix_padding_ms: int + """Amount of audio to include before the VAD detected speech (in milliseconds). + + Defaults to 300ms. + """ + + silence_duration_ms: int + """Duration of silence to detect speech stop (in milliseconds). + + Defaults to 500ms. With shorter values the model will respond more quickly, but + may jump in on short pauses from the user. + """ + + threshold: float + """Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. + + A higher threshold will require louder audio to activate the model, and thus + might perform better in noisy environments. + """ + + type: Literal["server_vad"] + """Type of turn detection. + + Only `server_vad` is currently supported for transcription sessions. + """ + + +class Session(TypedDict, total=False): + include: List[Literal["item.input_audio_transcription.logprobs"]] + """The set of items to include in the transcription. + + Current available items are: `item.input_audio_transcription.logprobs` + """ + + input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] + """The format of input audio. + + Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For `pcm16`, input audio must + be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian + byte order. + """ + + input_audio_noise_reduction: SessionInputAudioNoiseReduction + """Configuration for input audio noise reduction. + + This can be set to `null` to turn off. Noise reduction filters audio added to + the input audio buffer before it is sent to VAD and the model. Filtering the + audio can improve VAD and turn detection accuracy (reducing false positives) and + model performance by improving perception of the input audio. + """ + + input_audio_transcription: AudioTranscriptionParam + """Configuration for input audio transcription. + + The client can optionally set the language and prompt for transcription, these + offer additional guidance to the transcription service. 
+ """ + + turn_detection: SessionTurnDetection + """Configuration for turn detection. + + Can be set to `null` to turn off. Server VAD means that the model will detect + the start and end of speech based on audio volume and respond at the end of user + speech. + """ class TranscriptionSessionUpdateParam(TypedDict, total=False): - session: Required[RealtimeTranscriptionSessionCreateRequestParam] + session: Required[Session] """Realtime transcription session object configuration.""" type: Required[Literal["transcription_session.update"]] diff --git a/src/openai/types/realtime/transcription_session_updated_event.py b/src/openai/types/realtime/transcription_session_updated_event.py index 9abd1d20be..f6a52a12f3 100644 --- a/src/openai/types/realtime/transcription_session_updated_event.py +++ b/src/openai/types/realtime/transcription_session_updated_event.py @@ -1,105 +1,24 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Optional from typing_extensions import Literal from ..._models import BaseModel +from .realtime_transcription_session_create_response import RealtimeTranscriptionSessionCreateResponse -__all__ = [ - "TranscriptionSessionUpdatedEvent", - "Session", - "SessionAudio", - "SessionAudioInput", - "SessionAudioInputNoiseReduction", - "SessionAudioInputTranscription", - "SessionAudioInputTurnDetection", -] - - -class SessionAudioInputNoiseReduction(BaseModel): - type: Optional[Literal["near_field", "far_field"]] = None - - -class SessionAudioInputTranscription(BaseModel): - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. - """ - - model: Optional[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"]] = None - """The model to use for transcription. - - Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. - """ - - prompt: Optional[str] = None - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. - """ - - -class SessionAudioInputTurnDetection(BaseModel): - prefix_padding_ms: Optional[int] = None - - silence_duration_ms: Optional[int] = None - - threshold: Optional[float] = None - - type: Optional[str] = None - """Type of turn detection, only `server_vad` is currently supported.""" - - -class SessionAudioInput(BaseModel): - format: Optional[str] = None - """The format of input audio. 
Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.""" - - noise_reduction: Optional[SessionAudioInputNoiseReduction] = None - """Configuration for input audio noise reduction.""" - - transcription: Optional[SessionAudioInputTranscription] = None - """Configuration of the transcription model.""" - - turn_detection: Optional[SessionAudioInputTurnDetection] = None - """Configuration for turn detection.""" - - -class SessionAudio(BaseModel): - input: Optional[SessionAudioInput] = None - - -class Session(BaseModel): - id: Optional[str] = None - """Unique identifier for the session that looks like `sess_1234567890abcdef`.""" - - audio: Optional[SessionAudio] = None - """Configuration for input audio for the session.""" - - expires_at: Optional[int] = None - """Expiration timestamp for the session, in seconds since epoch.""" - - include: Optional[List[Literal["item.input_audio_transcription.logprobs"]]] = None - """Additional fields to include in server outputs. - - - `item.input_audio_transcription.logprobs`: Include logprobs for input audio - transcription. - """ - - object: Optional[str] = None - """The object type. Always `realtime.transcription_session`.""" +__all__ = ["TranscriptionSessionUpdatedEvent"] class TranscriptionSessionUpdatedEvent(BaseModel): event_id: str """The unique ID of the server event.""" - session: Session - """A Realtime transcription session configuration object.""" + session: RealtimeTranscriptionSessionCreateResponse + """A new Realtime transcription session configuration. + + When a session is created on the server via REST API, the session object also + contains an ephemeral key. Default TTL for keys is 10 minutes. This property is + not present when a session is updated via the WebSocket API. + """ type: Literal["transcription_session.updated"] """The event type, must be `transcription_session.updated`.""" diff --git a/tests/api_resources/realtime/test_client_secrets.py b/tests/api_resources/realtime/test_client_secrets.py index c477268ee6..b7bb0e5aa7 100644 --- a/tests/api_resources/realtime/test_client_secrets.py +++ b/tests/api_resources/realtime/test_client_secrets.py @@ -30,11 +30,13 @@ def test_method_create_with_all_params(self, client: OpenAI) -> None: "seconds": 10, }, session={ - "model": "string", "type": "realtime", "audio": { "input": { - "format": "pcm16", + "format": { + "rate": 24000, + "type": "audio/pcm", + }, "noise_reduction": {"type": "near_field"}, "transcription": { "language": "language", @@ -53,27 +55,24 @@ def test_method_create_with_all_params(self, client: OpenAI) -> None: }, }, "output": { - "format": "pcm16", + "format": { + "rate": 24000, + "type": "audio/pcm", + }, "speed": 0.25, "voice": "ash", }, }, - "client_secret": { - "expires_after": { - "anchor": "created_at", - "seconds": 0, - } - }, "include": ["item.input_audio_transcription.logprobs"], "instructions": "instructions", "max_output_tokens": 0, + "model": "string", "output_modalities": ["text"], "prompt": { "id": "id", "variables": {"foo": "string"}, "version": "version", }, - "temperature": 0, "tool_choice": "none", "tools": [ { @@ -128,11 +127,13 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> "seconds": 10, }, session={ - "model": "string", "type": "realtime", "audio": { "input": { - "format": "pcm16", + "format": { + "rate": 24000, + "type": "audio/pcm", + }, "noise_reduction": {"type": "near_field"}, "transcription": { "language": "language", @@ -151,27 +152,24 @@ async def test_method_create_with_all_params(self, async_client: 
AsyncOpenAI) -> }, }, "output": { - "format": "pcm16", + "format": { + "rate": 24000, + "type": "audio/pcm", + }, "speed": 0.25, "voice": "ash", }, }, - "client_secret": { - "expires_after": { - "anchor": "created_at", - "seconds": 0, - } - }, "include": ["item.input_audio_transcription.logprobs"], "instructions": "instructions", "max_output_tokens": 0, + "model": "string", "output_modalities": ["text"], "prompt": { "id": "id", "variables": {"foo": "string"}, "version": "version", }, - "temperature": 0, "tool_choice": "none", "tools": [ {
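For callers tracking this change, the `transcription_session.update` client event now takes an inline `Session` shape instead of `RealtimeTranscriptionSessionCreateRequest`. A minimal sketch of the new payload follows; field names mirror the `Session` TypedDict added in `transcription_session_update_param.py`, while the transcription sub-fields and the WebSocket send call are assumptions rather than anything stated in this diff.

```python
# Sketch: building a transcription_session.update event with the new inline
# Session shape. Field names come from the Session TypedDict above; the
# transcription sub-fields (model/language) and the send mechanism are assumed.
from openai.types.realtime.transcription_session_update_param import (
    TranscriptionSessionUpdateParam,
)

event: TranscriptionSessionUpdateParam = {
    "type": "transcription_session.update",
    "session": {
        "input_audio_format": "pcm16",
        "input_audio_noise_reduction": {"type": "near_field"},
        "input_audio_transcription": {
            "model": "gpt-4o-mini-transcribe",  # assumed choice of transcription model
            "language": "en",
        },
        "turn_detection": {
            "type": "server_vad",  # only server_vad is supported for transcription sessions
            "threshold": 0.5,
            "prefix_padding_ms": 300,
            "silence_duration_ms": 500,
        },
        "include": ["item.input_audio_transcription.logprobs"],
    },
}

# connection.send(event)  # assumes an already-open Realtime WebSocket connection
```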
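The reworked `test_method_create_with_all_params` cases also double as a reference for the GA client-secret request shape: `model` moves to the top level of the session, the audio `format` fields become objects (`{"type": "audio/pcm", "rate": 24000}`), and the nested `client_secret` and `temperature` fields are dropped. A trimmed sketch of an equivalent call, with placeholder values where the tests use dummies (the model name is illustrative):

```python
# Sketch of minting a Realtime client secret with the GA session shape shown in
# the updated tests; the model name and expiry values are placeholders.
from openai import OpenAI

client = OpenAI()

secret = client.realtime.client_secrets.create(
    expires_after={"anchor": "created_at", "seconds": 600},  # ~10 minute TTL
    session={
        "type": "realtime",
        "model": "gpt-realtime",  # placeholder model name
        "audio": {
            "input": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "noise_reduction": {"type": "near_field"},
            },
            "output": {
                "format": {"type": "audio/pcm", "rate": 24000},
                "voice": "ash",
            },
        },
    },
)

# The response carries the ephemeral key plus the resolved session configuration.
print(secret)
```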