Quellcode durchsuchen

[Vertex AI] Add prototype for testing Multimodal Live API

Andrew Heard vor 1 Jahr
Ursprung
Commit
af62480c10

+ 5 - 1
FirebaseVertexAI/Sources/GenerationConfig.swift

@@ -48,6 +48,8 @@ public struct GenerationConfig {
   /// Output schema of the generated candidate text.
   let responseSchema: Schema?
 
+  let responseModalities: [String]?
+
   /// Creates a new `GenerationConfig` value.
   ///
   /// See the
@@ -143,7 +145,7 @@ public struct GenerationConfig {
               candidateCount: Int? = nil, maxOutputTokens: Int? = nil,
               presencePenalty: Float? = nil, frequencyPenalty: Float? = nil,
               stopSequences: [String]? = nil, responseMIMEType: String? = nil,
-              responseSchema: Schema? = nil) {
+              responseSchema: Schema? = nil, responseModalities: [String]? = nil) {
     // Explicit init because otherwise if we re-arrange the above variables it changes the API
     // surface.
     self.temperature = temperature
@@ -156,6 +158,7 @@ public struct GenerationConfig {
     self.stopSequences = stopSequences
     self.responseMIMEType = responseMIMEType
     self.responseSchema = responseSchema
+    self.responseModalities = responseModalities
   }
 }
 
@@ -174,5 +177,6 @@ extension GenerationConfig: Encodable {
     case stopSequences
     case responseMIMEType = "responseMimeType"
     case responseSchema
+    case responseModalities
   }
 }

+ 7 - 0
FirebaseVertexAI/Sources/ModelContent.swift

@@ -112,6 +112,13 @@ extension ModelContent: Codable {
     case role
     case internalParts = "parts"
   }
+
+  /// Decodes a `ModelContent`, tolerating payloads that leave out `role` and/or `parts`.
+  ///
+  /// An absent (or null) `role` is decoded as `nil`; an absent (or null) `parts` array is
+  /// decoded as an empty list.
+  public init(from decoder: any Decoder) throws {
+    let values = try decoder.container(keyedBy: CodingKeys.self)
+    self.role = try values.decodeIfPresent(String.self, forKey: .role)
+    let decodedParts = try values.decodeIfPresent(
+      [ModelContent.InternalPart].self,
+      forKey: .internalParts
+    )
+    self.internalParts = decodedParts ?? []
+  }
 }
 
 @available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)

+ 36 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentClientContent.swift

@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Incremental update of the current conversation delivered from the client.
+/// All the content here is unconditionally appended to the conversation
+/// history and used as part of the prompt to the model to generate content.
+///
+/// A message here will interrupt any current model generation.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+struct BidiGenerateContentClientContent {
+  /// The content appended to the current conversation with the model.
+  ///
+  /// For single-turn queries, this is a single instance. For multi-turn
+  /// queries, this is a repeated field that contains the conversation history
+  /// and the latest request.
+  let turns: [ModelContent]?
+
+  /// If true, indicates that the server content generation should start with
+  /// the currently accumulated prompt. Otherwise, the server will await
+  /// additional messages before starting generation.
+  let turnComplete: Bool?
+}
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentClientContent: Encodable {}

+ 53 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentClientMessage.swift

@@ -0,0 +1,53 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Messages sent by the client in the BidiGenerateContent RPC call.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+enum BidiGenerateContentClientMessage {
+  /// Session configuration; sent as the first client message, and only as the first.
+  case setup(BidiGenerateContentSetup)
+
+  /// Incremental update of the current conversation delivered from the client.
+  case clientContent(BidiGenerateContentClientContent)
+
+  /// User input that is sent in real time.
+  case realtimeInput(BidiGenerateContentRealtimeInput)
+
+  /// Response to a `ToolCallMessage` received from the server.
+  case toolResponse(BidiGenerateContentToolResponse)
+}
+
+// MARK: - Encodable
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentClientMessage: Encodable {
+  /// Keys naming the kind of client message; exactly one is written per encoded message.
+  enum CodingKeys: CodingKey {
+    case setup, clientContent, realtimeInput, toolResponse
+  }
+
+  /// Encodes the message as a keyed container holding a single entry: the key names the
+  /// message kind and the value is that case's payload.
+  func encode(to encoder: any Encoder) throws {
+    var keyed = encoder.container(keyedBy: CodingKeys.self)
+    switch self {
+    case .setup(let payload):
+      try keyed.encode(payload, forKey: .setup)
+    case .clientContent(let payload):
+      try keyed.encode(payload, forKey: .clientContent)
+    case .realtimeInput(let payload):
+      try keyed.encode(payload, forKey: .realtimeInput)
+    case .toolResponse(let payload):
+      try keyed.encode(payload, forKey: .toolResponse)
+    }
+  }
+}

+ 36 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentRealtimeInput.swift

@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// User input that is sent in real time.
+///
+/// This is different from `BidiGenerateContentClientContent` in a few ways:
+///
+/// - Can be sent continuously without interruption to model generation.
+/// - If there is a need to mix data interleaved across the
+///   `BidiGenerateContentClientContent` and the `BidiGenerateContentRealtimeInput`, the server
+///   attempts to optimize for best response, but there are no guarantees.
+/// - End of turn is not explicitly specified, but is rather derived from user
+///   activity (for example, end of speech).
+/// - Even before the end of turn, the data is processed incrementally
+///   to optimize for a fast start of the response from the model.
+/// - Is always assumed to be the user's input (cannot be used to populate
+///   conversation history).
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+struct BidiGenerateContentRealtimeInput {
+  /// Inlined bytes data for media input.
+  let mediaChunks: [InlineData]
+}
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentRealtimeInput: Encodable {}

+ 56 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentServerContent.swift

@@ -0,0 +1,56 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Incremental server update generated by the model in response to client
+/// messages.
+///
+/// Content is generated as quickly as possible, and not in realtime. Clients
+/// may choose to buffer and play it out in realtime.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+struct BidiGenerateContentServerContent {
+  /// The content that the model has generated as part of the current
+  /// conversation with the user.
+  let modelTurn: ModelContent?
+
+  /// If true, indicates that the model is done generating. Generation will only
+  /// start in response to additional client messages. Can be set alongside
+  /// `modelTurn`, indicating that the `modelTurn` is the last in the turn.
+  let turnComplete: Bool
+
+  /// If true, indicates that a client message has interrupted current model
+  /// generation. If the client is playing out the content in realtime, this is a
+  /// good signal to stop and empty the current playback queue.
+  ///
+  /// Decoded as `false` when the field is absent from the server payload.
+  let interrupted: Bool
+
+  // Metadata specifies sources used to ground generated content.
+  // let groundingMetadata: GroundingMetadata?
+}
+
+// MARK: - Decodable
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentServerContent: Decodable {
+  enum CodingKeys: CodingKey {
+    case modelTurn, turnComplete, interrupted
+  }
+
+  /// Decodes server content where every field is optional on the wire: `modelTurn` stays
+  /// `nil` when absent, and the `turnComplete` / `interrupted` flags fall back to `false`.
+  init(from decoder: any Decoder) throws {
+    let values = try decoder.container(keyedBy: CodingKeys.self)
+    let decodedTurn = try values.decodeIfPresent(ModelContent.self, forKey: .modelTurn)
+    let isTurnComplete = try values.decodeIfPresent(Bool.self, forKey: .turnComplete)
+    let wasInterrupted = try values.decodeIfPresent(Bool.self, forKey: .interrupted)
+
+    self.modelTurn = decodedTurn
+    self.turnComplete = isTurnComplete ?? false
+    self.interrupted = wasInterrupted ?? false
+  }
+}

+ 64 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentServerMessage.swift

@@ -0,0 +1,64 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Response message for the BidiGenerateContent RPC call.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+enum BidiGenerateContentServerMessage {
+  /// Sent in response to a `BidiGenerateContentSetup` message from the client.
+  case setupComplete
+
+  /// Content generated by the model in response to client messages.
+  case serverContent(content: BidiGenerateContentServerContent)
+
+  // Not yet supported: request for the client to execute the `function_calls`
+  // and return the responses with the matching `id`s.
+  // case toolCall(BidiGenerateContentToolCall)
+
+  // Not yet supported: notification for the client that a previously issued
+  // `ToolCallMessage` with the specified `id`s should not be executed and
+  // should be cancelled.
+  // case toolCallCancellation(BidiGenerateContentToolCallCancellation)
+}
+
+// MARK: - Decodable
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentServerMessage: Decodable {
+  enum CodingKeys: CodingKey {
+    case setupComplete, serverContent
+  }
+
+  /// Decodes a server message by probing the known keys in order: `setupComplete` first,
+  /// then `serverContent`.
+  ///
+  /// - Throws: `DecodingError.typeMismatch` when neither key carries a value, plus any error
+  ///   raised while decoding the payload of a key that is present.
+  init(from decoder: any Decoder) throws {
+    let values = try decoder.container(keyedBy: CodingKeys.self)
+
+    // The setup acknowledgement has no payload of interest; only its presence matters.
+    let setupAck = try values.decodeIfPresent(
+      BidiGenerateContentSetupComplete.self,
+      forKey: .setupComplete
+    )
+    guard setupAck == nil else {
+      self = .setupComplete
+      return
+    }
+
+    let content = try values.decodeIfPresent(
+      BidiGenerateContentServerContent.self,
+      forKey: .serverContent
+    )
+    guard let content else {
+      let context = DecodingError.Context(
+        codingPath: values.codingPath,
+        debugDescription: "Unsupported BidiGenerateContentServerMessage type.",
+        underlyingError: nil
+      )
+      throw DecodingError.typeMismatch(BidiGenerateContentServerMessage.self, context)
+    }
+    self = .serverContent(content: content)
+  }
+}

+ 65 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentSetup.swift

@@ -0,0 +1,65 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Message to be sent in the first, and only the first,
+/// `BidiGenerateContentClientMessage`. Contains configuration that will apply
+/// for the duration of the streaming RPC.
+///
+/// Clients should wait for a `BidiGenerateContentSetupComplete` message before
+/// sending any additional messages.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public struct BidiGenerateContentSetup {
+  /// The fully qualified name of the publisher model or tuned model endpoint
+  /// to use.
+  ///
+  /// Publisher model format:
+  /// `projects/{project}/locations/{location}/publishers/*/models/*`
+  let model: String
+
+  /// Generation config.
+  ///
+  /// The following fields aren't supported:
+  ///
+  ///  - `response_logprobs`
+  ///  - `response_mime_type`
+  ///  - `logprobs`
+  ///  - `response_schema`
+  ///  - `stop_sequence`
+  ///  - `routing_config`
+  ///  - `audio_timestamp`
+  let generationConfig: GenerationConfig?
+
+  /// The user provided system instructions for the model.
+  /// Note: only text should be used in parts and content in each part will be
+  /// in a separate paragraph.
+  let systemInstruction: ModelContent?
+
+  /// A list of `Tools` the model may use to generate the next response.
+  ///
+  /// A `Tool` is a piece of code that enables the system to interact with
+  /// external systems to perform an action, or set of actions, outside of
+  /// knowledge and scope of the model.
+  let tools: [Tool]?
+
+  public init(model: String, generationConfig: GenerationConfig? = nil,
+              systemInstruction: ModelContent? = nil, tools: [Tool]? = nil) {
+    self.model = model
+    self.generationConfig = generationConfig
+    self.systemInstruction = systemInstruction
+    self.tools = tools
+  }
+}
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentSetup: Encodable {}

+ 18 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentSetupComplete.swift

@@ -0,0 +1,18 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Sent by the server in response to a `BidiGenerateContentSetup` message; carries no fields.
+struct BidiGenerateContentSetupComplete {}
+
+extension BidiGenerateContentSetupComplete: Decodable {}

+ 30 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/BidiGenerateContentToolResponse.swift

@@ -0,0 +1,30 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Client generated response to a `ToolCall` received from the server.
+/// Individual `FunctionResponse` objects are matched to the respective
+/// `FunctionCall` objects by the `id` field.
+///
+/// Note that in the unary and server-streaming GenerateContent APIs function
+/// calling happens by exchanging the `Content` parts, while in the bidi
+/// GenerateContent APIs function calling happens over this dedicated set of
+/// messages.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+struct BidiGenerateContentToolResponse {
+  /// The responses to the function calls.
+  let functionResponses: [FunctionResponse]
+}
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension BidiGenerateContentToolResponse: Encodable {}

+ 248 - 0
FirebaseVertexAI/Sources/Types/Public/MultimodalLive/MultimodalLiveModel.swift

@@ -0,0 +1,248 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Prototype client for the Multimodal Live API that exchanges `BidiGenerateContent`
+/// messages with the backend over a WebSocket.
+///
+/// The model acts as the delegate of its own `URLSession`. `URLSession` retains its delegate
+/// until the session is invalidated, so call `disconnect()` when finished with the model.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+public class MultimodalLiveModel: NSObject {
+  /// Host serving the WebSocket endpoint.
+  let host = "daily-firebaseml.sandbox.googleapis.com"
+  let modelName: String
+  let projectID: String
+  /// Fully qualified model resource name sent in the setup message.
+  let modelURI: String
+  /// Request used to open the WebSocket; carries the API key and content type headers.
+  let urlRequest: URLRequest
+  let decoder = JSONDecoder()
+  let encoder = JSONEncoder()
+
+  /// Set once `urlSession` has been created. `deinit` checks this so that it never
+  /// instantiates the lazy session (which takes `self` as its delegate) while the object is
+  /// being deallocated.
+  private var isSessionCreated = false
+
+  lazy var urlSession: URLSession = {
+    self.isSessionCreated = true
+    return URLSession(
+      configuration: URLSessionConfiguration.default,
+      delegate: self,
+      delegateQueue: nil
+    )
+  }()
+
+  lazy var webSocketTask: URLSessionWebSocketTask = urlSession.webSocketTask(with: urlRequest)
+
+  /// Creates a model client. The session and WebSocket task are created lazily, so nothing is
+  /// sent until `connect()` is called.
+  ///
+  /// - Parameters:
+  ///   - modelName: Model name used as the last component of the model URI.
+  ///   - projectID: Project identifier used in the model URI.
+  ///   - apiKey: API key sent in the `x-goog-api-key` header of the WebSocket request.
+  ///   - location: Location used in the model URI.
+  public init(modelName: String, projectID: String, apiKey: String, location: String) {
+    self.modelName = modelName
+    self.projectID = projectID
+    let urlString =
+      "wss://\(host)/ws/google.firebase.machinelearning.v2beta.LlmBidiService/BidiGenerateContent"
+    guard let url = URL(string: urlString) else {
+      fatalError("\(urlString) is not a valid URL.")
+    }
+    var urlRequest = URLRequest(url: url)
+    urlRequest.setValue(apiKey, forHTTPHeaderField: "x-goog-api-key")
+    urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
+    self.urlRequest = urlRequest
+
+    modelURI =
+      "projects/\(projectID)/locations/\(location)/publishers/google/models/\(modelName)"
+  }
+
+  deinit {
+    // Touching the lazy `webSocketTask` / `urlSession` here when they were never created would
+    // build a session with `self` as its delegate during deallocation. Only tear down a
+    // session that actually exists.
+    if isSessionCreated {
+      disconnect()
+    }
+  }
+
+  /// Starts the WebSocket task. The setup message is sent from the
+  /// `urlSession(_:webSocketTask:didOpenWithProtocol:)` delegate callback once the socket opens.
+  public func connect() async {
+    webSocketTask.resume()
+  }
+
+  /// Cancels the WebSocket task with the `.goingAway` close code and invalidates the session
+  /// once its outstanding tasks have finished.
+  public func disconnect() {
+    print("Disconnecting...")
+    webSocketTask.cancel(with: .goingAway, reason: nil)
+    urlSession.finishTasksAndInvalidate()
+    print("Disconnected.")
+  }
+
+  /// Sends the `BidiGenerateContentSetup` message and waits for the server's acknowledgement.
+  ///
+  /// Every failure path closes the connection via `disconnect()`.
+  ///
+  /// - Returns: `true` if the first message received from the server decodes as
+  ///   `setupComplete`; `false` otherwise.
+  func sendInitialSetupMessages() async -> Bool {
+    let setup = BidiGenerateContentClientMessage.setup(
+      BidiGenerateContentSetup(
+        model: modelURI,
+        generationConfig: GenerationConfig(responseModalities: ["TEXT"])
+      )
+    )
+    let setupData: Data
+    do {
+      setupData = try encoder.encode(setup)
+    } catch {
+      print("Error encoding BidiGenerateContentSetup.")
+      disconnect()
+      return false
+    }
+    // The JSON text is only used for logging, but failing to produce it aborts the setup.
+    guard let setupDataJSON = String(data: setupData, encoding: .utf8) else {
+      disconnect()
+      return false
+    }
+    do {
+      print("Sending BidiGenerateContentSetup...")
+      try await webSocketTask.send(.data(setupData))
+      print("Sent BidiGenerateContentSetup.")
+      print("BidiGenerateContentSetup JSON: \(setupDataJSON)")
+    } catch {
+      print("Error sending BidiGenerateContentSetup.")
+      disconnect()
+      return false
+    }
+
+    let setupResponse: URLSessionWebSocketTask.Message
+    do {
+      print("Receiving BidiGenerateContentServerMessage...")
+      setupResponse = try await webSocketTask.receive()
+      print("Received BidiGenerateContentServerMessage.")
+    } catch {
+      print("Error receiving BidiGenerateContentSetupComplete response: \(error)")
+      disconnect()
+      return false
+    }
+
+    // Only binary frames are handled; any other frame type is treated as a failure.
+    guard case let .data(data) = setupResponse else {
+      print("Received unknown response type: \(setupResponse)")
+      disconnect()
+      return false
+    }
+    guard let serverMessageJSON = String(data: data, encoding: .utf8) else {
+      disconnect()
+      return false
+    }
+    print("BidiGenerateContentServerMessage JSON: \(serverMessageJSON)")
+
+    let serverMessage: BidiGenerateContentServerMessage
+    do {
+      serverMessage = try decoder.decode(BidiGenerateContentServerMessage.self, from: data)
+    } catch {
+      print("Failed to decode BidiGenerateContentServerMessage: \(error)")
+      disconnect()
+      return false
+    }
+    guard case .setupComplete = serverMessage else {
+      print("Received unknown server message: \(serverMessage)")
+      disconnect()
+      return false
+    }
+    print("Processed BidiGenerateContentSetupComplete message.")
+    return true
+  }
+
+  /// Receives, decodes and logs server messages for as long as the WebSocket task is running.
+  ///
+  /// Returns immediately if the task is not running. A receive error, a non-binary frame or
+  /// an undecodable message closes the connection and ends the loop.
+  func startListening() async {
+    guard case .running = webSocketTask.state else {
+      print("The WebSocket is in an unexpected state: \(webSocketTask.state).")
+      return
+    }
+
+    while webSocketTask.state == .running {
+      let message: URLSessionWebSocketTask.Message
+      do {
+        print("Waiting for a BidiGenerateContentServerMessage...")
+        message = try await webSocketTask.receive()
+        print("Received BidiGenerateContentServerMessage.")
+      } catch {
+        print("Error receiving BidiGenerateContentServerMessage: \(error)")
+        disconnect()
+        return
+      }
+
+      guard case let .data(data) = message else {
+        print("Received unexpected message type: \(message)")
+        disconnect()
+        return
+      }
+      guard let messageJSON = String(data: data, encoding: .utf8) else {
+        print("Failed to decode BidiGenerateContentServerMessage as JSON.")
+        disconnect()
+        return
+      }
+
+      let serverMessage: BidiGenerateContentServerMessage
+      do {
+        serverMessage = try decoder.decode(BidiGenerateContentServerMessage.self, from: data)
+      } catch {
+        print("Failed to decode BidiGenerateContentServerMessage: \(error)")
+        print("BidiGenerateContentServerMessage JSON: \(messageJSON)")
+        disconnect()
+        return
+      }
+      print("Decoded BidiGenerateContentServerMessage: \(serverMessage)")
+      print("BidiGenerateContentServerMessage JSON: \(messageJSON)")
+    }
+  }
+
+  /// Sends `audioData` as a single `audio/pcm` realtime-input chunk, then listens for server
+  /// messages.
+  ///
+  /// Note that this call does not return until `startListening()` finishes. Encoding or send
+  /// failures close the connection.
+  ///
+  /// - Parameter audioData: The audio bytes to send as inline data.
+  public func sendAudioMessage(audioData: Data) async {
+    let audioPart = InlineData(data: audioData, mimeType: "audio/pcm")
+    let realtimeInput = BidiGenerateContentRealtimeInput(mediaChunks: [audioPart])
+    let clientMessage = BidiGenerateContentClientMessage.realtimeInput(realtimeInput)
+
+    let messageData: Data
+    do {
+      messageData = try encoder.encode(clientMessage)
+    } catch {
+      print("Error encoding BidiGenerateContentClientMessage.")
+      disconnect()
+      return
+    }
+    guard let messageJSON = String(data: messageData, encoding: .utf8) else {
+      print("Failed to convert BidiGenerateContentClientMessage to JSON.")
+      disconnect()
+      return
+    }
+    do {
+      print("Sending BidiGenerateContentClientMessage JSON: \(messageJSON)")
+      try await webSocketTask.send(.data(messageData))
+      print("Sent BidiGenerateContentClientMessage.")
+    } catch {
+      print("Error sending BidiGenerateContentClientMessage.")
+      disconnect()
+      return
+    }
+
+    print("Start listening for a response...")
+    await startListening()
+  }
+}
+
+// MARK: - URLSessionWebSocketDelegate
+
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *)
+extension MultimodalLiveModel: URLSessionWebSocketDelegate {
+  /// Called when the WebSocket has opened; starts the setup handshake in a new task and logs
+  /// if it does not succeed.
+  public func urlSession(_ session: URLSession,
+                         webSocketTask: URLSessionWebSocketTask,
+                         didOpenWithProtocol protocol: String?) {
+    print("WebSocket opened.")
+    Task {
+      print("Sending initial setup messages after WebSocket opened.")
+      let didCompleteSetup = await sendInitialSetupMessages()
+      if !didCompleteSetup {
+        print("Setup failed.")
+      }
+    }
+  }
+
+  /// Called when the WebSocket has closed; logs the close code. The `reason` is not inspected.
+  public func urlSession(_ session: URLSession,
+                         webSocketTask: URLSessionWebSocketTask,
+                         didCloseWith closeCode: URLSessionWebSocketTask.CloseCode,
+                         reason: Data?) {
+    print("WebSocket closed with code: \(closeCode)")
+  }
+}

+ 12 - 0
FirebaseVertexAI/Sources/VertexAI.swift

@@ -136,6 +136,18 @@ public class VertexAI {
     )
   }
 
+  /// Creates a `MultimodalLiveModel` prototype client bound to this instance's project, API
+  /// key and location.
+  ///
+  /// - Parameters:
+  ///   - modelName: Name of the model to use; defaults to `"gemini-2.0-flash-001"`.
+  ///   - generationConfig: Accepted for API shape only. NOTE(review): this value is never read
+  ///     here and is not forwarded to the returned model.
+  /// - Returns: A newly created `MultimodalLiveModel`.
+  public func multimodalLiveModel(modelName: String = "gemini-2.0-flash-001",
+                                  generationConfig: GenerationConfig =
+                                    GenerationConfig(responseModalities: ["TEXT"]))
+    -> MultimodalLiveModel {
+    let liveModel = MultimodalLiveModel(
+      modelName: modelName,
+      projectID: projectID,
+      apiKey: apiKey,
+      location: location
+    )
+    return liveModel
+  }
+
   /// Class to enable VertexAI to register via the Objective-C based Firebase component system
   /// to include VertexAI in the userAgent.
   @objc(FIRVertexAIComponent) class FirebaseVertexAIComponent: NSObject {}