Procházet zdrojové kódy

feat(ai): Add support for sending videos via Live API. (#15432)

Daymon před 5 měsíci
rodič
revize
be30d46dfc

+ 3 - 0
FirebaseAI/CHANGELOG.md

@@ -2,6 +2,9 @@
 - [fixed] Fixed various links in the Live API doc comments not mapping correctly.
 - [fixed] Fixed minor translation issue for nanosecond conversion when receiving
   `LiveServerGoingAwayNotice`. (#15410)
+- [feature] Added support for sending video frames with the Live API via the `sendVideoRealtime`
+  method on [`LiveSession`](https://firebase.google.com/docs/reference/swift/firebaseai/api/reference/Classes/LiveSession).
+  (#15432)
 
 # 12.4.0
 - [feature] Added support for the URL context tool, which allows the model to access content

+ 15 - 6
FirebaseAI/Sources/Types/Public/Live/LiveSession.swift

@@ -67,15 +67,24 @@ public final class LiveSession: Sendable {
     await service.send(.realtimeInput(message))
   }
 
-  /// Sends a video input stream to the model, using the realtime API.
+  /// Sends a video frame to the model, using the realtime API.
+  ///
+  /// Instead of raw video data, the model expects individual frames of the video,
+  /// sent as images.
+  ///
+  /// If your video has audio, send it separately through ``LiveSession/sendAudioRealtime(_:)``.
+  ///
+  /// For better performance, frames can also be sent at a lower rate than the video;
+  /// even as low as 1 frame per second.
   ///
   /// - Parameters:
-  ///   - video: Encoded video data, used to update the model on the client's conversation.
-  ///   - format: The format that the video was encoded in (eg; `mp4`, `webm`, `wmv`, etc.,).
-  // TODO: (b/448671945) Make public after testing and next release
-  func sendVideoRealtime(_ video: Data, format: String) async {
+  ///   - video: Encoded image data extracted from a frame of the video, used to update the model on
+  ///     the client's conversation.
+  ///   - mimeType: The IANA standard MIME type of the video frame data (eg; `image/png`,
+  ///     `image/jpeg`, etc.,).
+  public func sendVideoRealtime(_ video: Data, mimeType: String) async {
     let message = BidiGenerateContentRealtimeInput(
-      video: InlineData(data: video, mimeType: "video/\(format)")
+      video: InlineData(data: video, mimeType: mimeType)
     )
     await service.send(.realtimeInput(message))
   }

+ 4 - 0
FirebaseAI/Tests/TestApp/FirebaseAITestApp.xcodeproj/project.pbxproj

@@ -7,6 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E0481212EA2E51100A50172 /* DataUtils.swift */; };
 		0E460FAB2E9858E4007E26A6 /* LiveSessionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */; };
 		0EC8BAE22E98784E0075A4E0 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 868A7C532CCC26B500E449DD /* Assets.xcassets */; };
 		862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */; };
@@ -44,6 +45,7 @@
 /* End PBXContainerItemProxy section */
 
 /* Begin PBXFileReference section */
+		0E0481212EA2E51100A50172 /* DataUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DataUtils.swift; sourceTree = "<group>"; };
 		0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveSessionTests.swift; sourceTree = "<group>"; };
 		862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IntegrationTestUtils.swift; sourceTree = "<group>"; };
 		864F8F702D4980D60002EA7E /* ImagenIntegrationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImagenIntegrationTests.swift; sourceTree = "<group>"; };
@@ -168,6 +170,7 @@
 		8698D7442CD3CEF700ABA833 /* Utilities */ = {
 			isa = PBXGroup;
 			children = (
+				0E0481212EA2E51100A50172 /* DataUtils.swift */,
 				86D77E032D7B6C95003D155D /* InstanceConfig.swift */,
 				862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */,
 			);
@@ -304,6 +307,7 @@
 				DEF0BB512DA9B7450093E9F4 /* SchemaTests.swift in Sources */,
 				DEF0BB4F2DA74F680093E9F4 /* TestHelpers.swift in Sources */,
 				868A7C4F2CCC229F00E449DD /* Credentials.swift in Sources */,
+				0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */,
 				864F8F712D4980DD0002EA7E /* ImagenIntegrationTests.swift in Sources */,
 				862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */,
 				86D77DFC2D7A5340003D155D /* GenerateContentIntegrationTests.swift in Sources */,

+ 13 - 0
FirebaseAI/Tests/TestApp/Resources/Assets.xcassets/cat.dataset/Contents.json

@@ -0,0 +1,13 @@
+{
+  "data" : [
+    {
+      "filename" : "videoplayback.mp4",
+      "idiom" : "universal",
+      "universal-type-identifier" : "public.mpeg-4"
+    }
+  ],
+  "info" : {
+    "author" : "xcode",
+    "version" : 1
+  }
+}

binární
FirebaseAI/Tests/TestApp/Resources/Assets.xcassets/cat.dataset/videoplayback.mp4


+ 51 - 0
FirebaseAI/Tests/TestApp/Tests/Integration/LiveSessionTests.swift

@@ -76,6 +76,14 @@ struct LiveSessionTests {
       role: "system",
       parts: "When you receive a message, if the message is a single word, assume it's the first name of a person, and call the getLastName tool to get the last name of said person. Only respond with the last name."
     )
+
+    static let animalInVideo = ModelContent(
+      role: "system",
+      parts: """
+      Send a one word response of what ANIMAL is in the video. \
+      If you don't receive a video, send "Test is broken, I didn't receive a video.".
+      """.trimmingCharacters(in: .whitespacesAndNewlines)
+    )
   }
 
   @Test(arguments: arguments)
@@ -181,6 +189,49 @@ struct LiveSessionTests {
     #expect(modelResponse == "goodbye")
   }
 
+  @Test(arguments: arguments.filter { $0.1 != ModelNames.gemini2FlashLive })
+  // gemini-2.0-flash-live-001 is buggy and likes to respond to the audio or system instruction
+  // (eg; it will say 'okay' or 'hello', instead of following the instructions)
+  func sendVideoRealtime_receiveText(_ config: InstanceConfig, modelName: String) async throws {
+    let model = FirebaseAI.componentInstance(config).liveModel(
+      modelName: modelName,
+      generationConfig: textConfig,
+      systemInstruction: SystemInstructions.animalInVideo
+    )
+
+    let session = try await model.connect()
+    guard let videoFile = NSDataAsset(name: "cat") else {
+      Issue.record("Missing video file 'cat' in Assets")
+      return
+    }
+
+    let frames = try await videoFile.videoFrames()
+    for frame in frames {
+      await session.sendVideoRealtime(frame, mimeType: "image/png")
+    }
+
+    // the model doesn't respond unless we send some audio too
+    // vertex also responds if you send text, but google ai doesn't
+    // (they both respond with audio though)
+    guard let audioFile = NSDataAsset(name: "hello") else {
+      Issue.record("Missing audio file 'hello.wav' in Assets")
+      return
+    }
+    await session.sendAudioRealtime(audioFile.data)
+    await session.sendAudioRealtime(Data(repeating: 0, count: audioFile.data.count))
+
+    let text = try await session.collectNextTextResponse()
+
+    await session.close()
+    let modelResponse = text
+      .trimmingCharacters(in: .whitespacesAndNewlines)
+      .trimmingCharacters(in: .punctuationCharacters)
+      .lowercased()
+
+    // model response varies
+    #expect(["kitten", "cat", "kitty"].contains(modelResponse))
+  }
+
   @Test(arguments: arguments)
   func realtime_functionCalling(_ config: InstanceConfig, modelName: String) async throws {
     let model = FirebaseAI.componentInstance(config).liveModel(

+ 64 - 0
FirebaseAI/Tests/TestApp/Tests/Utilities/DataUtils.swift

@@ -0,0 +1,64 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import AVFoundation
+import SwiftUI
+
+extension NSDataAsset {
+  /// The preferred file extension for this asset, if any.
+  ///
+  /// This is set in the Asset catalog under the `File Type` field.
+  var fileExtension: String? {
+    UTType(typeIdentifier)?.preferredFilenameExtension
+  }
+
+  /// Extracts `.png` frames from a video at a rate of 1 FPS.
+  ///
+  /// - Returns:
+  ///   An array of `Data` corresponding to individual images for each frame.
+  func videoFrames() async throws -> [Data] {
+    guard let fileExtension else {
+      fatalError(
+        "Failed to find file extension; ensure the \"File Type\" is set in the asset catalog."
+      )
+    }
+
+    // we need a temp file so we can provide a URL to AVURLAsset
+    let tempFileURL = URL(fileURLWithPath: NSTemporaryDirectory())
+      .appendingPathComponent(UUID().uuidString, isDirectory: false)
+      .appendingPathExtension(fileExtension)
+
+    try data.write(to: tempFileURL)
+
+    defer {
+      try? FileManager.default.removeItem(at: tempFileURL)
+    }
+
+    let asset = AVURLAsset(url: tempFileURL)
+    let generator = AVAssetImageGenerator(asset: asset)
+
+    let duration = try await asset.load(.duration).seconds
+    return try stride(from: 0, to: duration, by: 1).map { seconds in
+      let time = CMTime(seconds: seconds, preferredTimescale: 1)
+      let cg = try generator.copyCGImage(at: time, actualTime: nil)
+
+      let image = UIImage(cgImage: cg)
+      guard let png = image.pngData() else {
+        fatalError("Failed to encode image to png")
+      }
+
+      return png
+    }
+  }
+}