
Commit 8766a24

Ivo Bellin Salarin committed
feat: copy transcript and timestamps
1 parent bef740f

16 files changed: +568 −13 lines changed

Recap.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
@@ -597,8 +597,8 @@
             isa = XCRemoteSwiftPackageReference;
             repositoryURL = "https://github.com/argmaxinc/WhisperKit.git";
             requirement = {
-                branch = main;
-                kind = branch;
+                kind = upToNextMajorVersion;
+                minimumVersion = 0.9.0;
             };
         };
         A743B0892E3D479600785BFF /* XCRemoteSwiftPackageReference "swift-markdown-ui" */ = {
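
This pins WhisperKit to tagged releases instead of tracking the main branch. For reference, the equivalent constraint in Package.swift manifest syntax would look like this (a sketch; the project itself declares the dependency through the Xcode project file, not a manifest):

import PackageDescription

let package = Package(
    name: "Recap",
    dependencies: [
        // from: "0.9.0" resolves as up-to-next-major, i.e. >= 0.9.0 and < 1.0.0
        .package(url: "https://github.com/argmaxinc/WhisperKit.git", from: "0.9.0")
    ]
)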

Recap.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default.

Recap/DataModels/RecapDataModel.xcdatamodeld/RecapDataModel.xcdatamodel/contents

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@
     <attribute name="state" attributeType="Integer 16" defaultValueString="0" usesScalarValueType="YES"/>
     <attribute name="summaryText" optional="YES" attributeType="String"/>
     <attribute name="transcriptionText" optional="YES" attributeType="String"/>
+    <attribute name="timestampedTranscriptionData" optional="YES" attributeType="Binary"/>
     <fetchIndex name="byCreatedAt">
         <fetchIndexElement property="createdAt" type="Binary" order="descending"/>
     </fetchIndex>

Recap/Repositories/Models/RecordingInfo.swift

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,7 @@ struct RecordingInfo: Identifiable, Equatable {
     let applicationName: String?
     let transcriptionText: String?
     let summaryText: String?
+    let timestampedTranscription: TimestampedTranscription?
     let createdAt: Date
     let modifiedAt: Date

@@ -50,6 +51,13 @@ extension RecordingInfo {
         self.applicationName = entity.applicationName
         self.transcriptionText = entity.transcriptionText
         self.summaryText = entity.summaryText
+
+        // Decode timestamped transcription data if available
+        if let data = entity.timestampedTranscriptionData {
+            self.timestampedTranscription = try? JSONDecoder().decode(TimestampedTranscription.self, from: data)
+        } else {
+            self.timestampedTranscription = nil
+        }
         self.createdAt = entity.createdAt ?? Date()
         self.modifiedAt = entity.modifiedAt ?? Date()
     }

Recap/Repositories/Recordings/RecordingRepository.swift

Lines changed: 20 additions & 0 deletions
@@ -139,6 +139,26 @@ final class RecordingRepository: RecordingRepositoryType {
         }
     }

+    func updateRecordingTimestampedTranscription(id: String, timestampedTranscription: TimestampedTranscription) async throws {
+        try await withCheckedThrowingContinuation { continuation in
+            coreDataManager.performBackgroundTask { context in
+                do {
+                    let recording = try self.fetchRecordingEntity(id: id, context: context)
+
+                    // Encode the timestamped transcription to binary data
+                    let data = try JSONEncoder().encode(timestampedTranscription)
+                    recording.timestampedTranscriptionData = data
+                    recording.modifiedAt = Date()
+
+                    try context.save()
+                    continuation.resume()
+                } catch {
+                    continuation.resume(throwing: error)
+                }
+            }
+        }
+    }
+
     func updateRecordingSummary(id: String, summaryText: String) async throws {
         try await withCheckedThrowingContinuation { continuation in
             coreDataManager.performBackgroundTask { context in
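
Together with the decoding in RecordingInfo above, this gives a JSON round trip through the Binary attribute. A minimal standalone sketch of that round trip, using the TranscriptionSegment and TimestampedTranscription types introduced later in this commit (segment values are illustrative):

import Foundation

let segment = TranscriptionSegment(text: "Hello", startTime: 0.0, endTime: 1.2, source: .microphone)
let original = TimestampedTranscription(segments: [segment])

// What the repository writes into timestampedTranscriptionData
let data = try JSONEncoder().encode(original)

// What RecordingInfo reads back out of the entity
let decoded = try JSONDecoder().decode(TimestampedTranscription.self, from: data)
assert(decoded == original) // Codable + Equatable make the round trip checkable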

Recap/Repositories/Recordings/RecordingRepositoryType.swift

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ protocol RecordingRepositoryType {
     func updateRecordingState(id: String, state: RecordingProcessingState, errorMessage: String?) async throws
     func updateRecordingEndDate(id: String, endDate: Date) async throws
     func updateRecordingTranscription(id: String, transcriptionText: String) async throws
+    func updateRecordingTimestampedTranscription(id: String, timestampedTranscription: TimestampedTranscription) async throws
     func updateRecordingSummary(id: String, summaryText: String) async throws
     func updateRecordingURLs(id: String, recordingURL: URL?, microphoneURL: URL?) async throws
     func deleteRecording(id: String) async throws

Recap/Services/Processing/ProcessingCoordinator.swift

Lines changed: 8 additions & 0 deletions
@@ -126,6 +126,14 @@ final class ProcessingCoordinator: ProcessingCoordinatorType {
             transcriptionText: transcriptionResult.combinedText
         )

+        // Save timestamped transcription data if available
+        if let timestampedTranscription = transcriptionResult.timestampedTranscription {
+            try await recordingRepository.updateRecordingTimestampedTranscription(
+                id: recording.id,
+                timestampedTranscription: timestampedTranscription
+            )
+        }
+
         try await updateRecordingState(recording.id, state: .transcribed)

         return transcriptionResult.combinedText
Recap/Services/Transcription/TimestampedTranscription.swift (new file; path inferred from the type name and surrounding files)

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+import Foundation
+
+/// Represents a single segment of transcribed text with timing information
+struct TranscriptionSegment: Equatable, Codable {
+    let text: String
+    let startTime: TimeInterval
+    let endTime: TimeInterval
+    let source: AudioSource
+
+    /// The audio source this segment came from
+    enum AudioSource: String, CaseIterable, Codable {
+        case systemAudio = "system_audio"
+        case microphone = "microphone"
+    }
+
+    /// Duration of this segment
+    var duration: TimeInterval {
+        endTime - startTime
+    }
+
+    /// Check if this segment overlaps with another segment
+    func overlaps(with other: TranscriptionSegment) -> Bool {
+        return startTime < other.endTime && endTime > other.startTime
+    }
+
+    /// Check if this segment occurs before another segment
+    func isBefore(_ other: TranscriptionSegment) -> Bool {
+        return endTime <= other.startTime
+    }
+
+    /// Check if this segment occurs after another segment
+    func isAfter(_ other: TranscriptionSegment) -> Bool {
+        return startTime >= other.endTime
+    }
+}
+
+/// Collection of transcription segments with utility methods for merging and sorting
+struct TimestampedTranscription: Equatable, Codable {
+    let segments: [TranscriptionSegment]
+    let totalDuration: TimeInterval
+
+    init(segments: [TranscriptionSegment]) {
+        self.segments = segments.sorted { $0.startTime < $1.startTime }
+        self.totalDuration = segments.map { $0.endTime }.max() ?? 0
+    }
+
+    /// Get all segments from a specific audio source
+    func segments(from source: TranscriptionSegment.AudioSource) -> [TranscriptionSegment] {
+        return segments.filter { $0.source == source }
+    }
+
+    /// Get segments within a specific time range
+    func segments(in timeRange: ClosedRange<TimeInterval>) -> [TranscriptionSegment] {
+        return segments.filter { segment in
+            segment.startTime <= timeRange.upperBound && segment.endTime >= timeRange.lowerBound
+        }
+    }
+
+    /// Merge with another timestamped transcription, interleaving by time
+    func merged(with other: TimestampedTranscription) -> TimestampedTranscription {
+        let allSegments = segments + other.segments
+        return TimestampedTranscription(segments: allSegments)
+    }
+
+    /// Get a simple text representation (current behavior)
+    var combinedText: String {
+        return segments.map { $0.text }.joined(separator: " ")
+    }
+
+    /// Get a formatted text representation with timestamps
+    var formattedText: String {
+        return segments.map { segment in
+            let startMinutes = Int(segment.startTime) / 60
+            let startSeconds = Int(segment.startTime) % 60
+            let endMinutes = Int(segment.endTime) / 60
+            let endSeconds = Int(segment.endTime) % 60
+
+            return "[\(String(format: "%02d:%02d", startMinutes, startSeconds))-\(String(format: "%02d:%02d", endMinutes, endSeconds))] [\(segment.source.rawValue)] \(segment.text)"
+        }.joined(separator: "\n")
+    }
+
+    /// Get segments grouped by source
+    var segmentsBySource: [TranscriptionSegment.AudioSource: [TranscriptionSegment]] {
+        return Dictionary(grouping: segments) { $0.source }
+    }
+}
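
A short sketch of how this model composes, with illustrative segment values: merged(with:) concatenates two sources and the init re-sorts by start time, so formattedText interleaves them chronologically.

let system = TimestampedTranscription(segments: [
    TranscriptionSegment(text: "Welcome, everyone.", startTime: 0.0, endTime: 2.5, source: .systemAudio)
])
let mic = TimestampedTranscription(segments: [
    TranscriptionSegment(text: "Thanks for having me.", startTime: 2.5, endTime: 4.0, source: .microphone)
])

let merged = system.merged(with: mic)
print(merged.formattedText)
// [00:00-00:02] [system_audio] Welcome, everyone.
// [00:02-00:04] [microphone] Thanks for having me.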

Recap/Services/Transcription/TranscriptionService.swift

Lines changed: 73 additions & 7 deletions
@@ -25,27 +25,37 @@ final class TranscriptionService: TranscriptionServiceType {
             throw TranscriptionError.modelNotAvailable
         }

+        // Get both text and timestamped segments
         let systemAudioText = try await transcribeAudioFile(audioURL, with: whisperKit)
+        let systemAudioSegments = try await transcribeAudioFileWithTimestamps(audioURL, with: whisperKit, source: .systemAudio)

         var microphoneText: String?
+        var microphoneSegments: [TranscriptionSegment] = []
+
         if let microphoneURL = microphoneURL,
            FileManager.default.fileExists(atPath: microphoneURL.path) {
             microphoneText = try await transcribeAudioFile(microphoneURL, with: whisperKit)
+            microphoneSegments = try await transcribeAudioFileWithTimestamps(microphoneURL, with: whisperKit, source: .microphone)
         }

         let combinedText = buildCombinedText(
             systemAudioText: systemAudioText,
             microphoneText: microphoneText
         )

+        // Create timestamped transcription by merging segments
+        let allSegments = systemAudioSegments + microphoneSegments
+        let timestampedTranscription = TimestampedTranscription(segments: allSegments)
+
         let duration = Date().timeIntervalSince(startTime)

         return TranscriptionResult(
             systemAudioText: systemAudioText,
             microphoneText: microphoneText,
             combinedText: combinedText,
             transcriptionDuration: duration,
-            modelUsed: modelName
+            modelUsed: modelName,
+            timestampedTranscription: timestampedTranscription
         )
     }

@@ -67,34 +77,54 @@ final class TranscriptionService: TranscriptionServiceType {

     private func loadModel(_ modelName: String, isDownloaded: Bool) async throws {
         do {
+            print("Loading WhisperKit model: \(modelName), isDownloaded: \(isDownloaded)")
+
+            // Always try to download/load the model, as WhisperKit will handle caching
+            // The isDownloaded flag is just for UI purposes, but WhisperKit manages its own cache
             let newWhisperKit = try await WhisperKit.createWithProgress(
                 model: modelName,
                 modelRepo: "argmaxinc/whisperkit-coreml",
                 modelFolder: nil,
-                download: true,
+                download: true, // Always allow download, WhisperKit will use cache if available
                 progressCallback: { progress in
-                    // todo: notify UI?
                     print("WhisperKit download progress: \(progress.fractionCompleted)")
                 }
             )

+            print("WhisperKit model loaded successfully: \(modelName)")
             self.whisperKit = newWhisperKit
             self.loadedModelName = modelName

+            // Mark as downloaded in our repository if not already marked
             if !isDownloaded {
-                try await whisperModelRepository.markAsDownloaded(name: modelName, sizeInMB: nil)
+                let modelInfo = await WhisperKit.getModelSizeInfo(for: modelName)
+                try await whisperModelRepository.markAsDownloaded(name: modelName, sizeInMB: Int64(modelInfo.totalSizeMB))
+                print("Model marked as downloaded: \(modelName), size: \(modelInfo.totalSizeMB) MB")
             }

         } catch {
-            throw TranscriptionError.modelLoadingFailed(error.localizedDescription)
+            print("Failed to load WhisperKit model \(modelName): \(error)")
+            throw TranscriptionError.modelLoadingFailed("Failed to load model \(modelName): \(error.localizedDescription)")
         }
     }

     private func transcribeAudioFile(_ url: URL, with whisperKit: WhisperKit) async throws -> String {
         do {
-            let transcriptionResults = try await whisperKit.transcribe(audioPath: url.path)
+            let options = DecodingOptions(
+                task: .transcribe,
+                language: nil, // Auto-detect language
+                withoutTimestamps: false, // We want timestamps
+                wordTimestamps: false // We don't need word-level timestamps for basic transcription
+            )
+
+            let results = try await whisperKit.transcribe(audioPath: url.path, decodeOptions: options)
+            let result = results.first

-            let text = transcriptionResults
+            guard let segments = result?.segments else {
+                return ""
+            }
+
+            let text = segments
                 .map { $0.text.trimmingCharacters(in: .whitespacesAndNewlines) }
                 .filter { !$0.isEmpty }
                 .joined(separator: " ")
@@ -106,6 +136,42 @@ final class TranscriptionService: TranscriptionServiceType {
         }
     }

+    private func transcribeAudioFileWithTimestamps(_ url: URL, with whisperKit: WhisperKit, source: TranscriptionSegment.AudioSource) async throws -> [TranscriptionSegment] {
+        do {
+            let options = DecodingOptions(
+                task: .transcribe,
+                language: nil, // Auto-detect language
+                withoutTimestamps: false, // We want timestamps
+                wordTimestamps: true // Enable word timestamps for precise timing
+            )
+
+            let results = try await whisperKit.transcribe(audioPath: url.path, decodeOptions: options)
+            let result = results.first
+
+            guard let segments = result?.segments else {
+                return []
+            }
+
+            // Convert WhisperKit segments to our TranscriptionSegment format
+            let transcriptionSegments = segments.compactMap { segment -> TranscriptionSegment? in
+                let text = segment.text.trimmingCharacters(in: .whitespacesAndNewlines)
+                guard !text.isEmpty else { return nil }
+
+                return TranscriptionSegment(
+                    text: text,
+                    startTime: TimeInterval(segment.start),
+                    endTime: TimeInterval(segment.end),
+                    source: source
+                )
+            }
+
+            return transcriptionSegments
+
+        } catch {
+            throw TranscriptionError.transcriptionFailed(error.localizedDescription)
+        }
+    }
+
     private func buildCombinedText(systemAudioText: String, microphoneText: String?) -> String {
         var combinedText = systemAudioText

Recap/Services/Transcription/TranscriptionServiceType.swift

Lines changed: 19 additions & 0 deletions
@@ -13,6 +13,25 @@ struct TranscriptionResult: Equatable {
     let combinedText: String
     let transcriptionDuration: TimeInterval
     let modelUsed: String
+
+    // New timestamped transcription data
+    let timestampedTranscription: TimestampedTranscription?
+
+    init(
+        systemAudioText: String,
+        microphoneText: String?,
+        combinedText: String,
+        transcriptionDuration: TimeInterval,
+        modelUsed: String,
+        timestampedTranscription: TimestampedTranscription? = nil
+    ) {
+        self.systemAudioText = systemAudioText
+        self.microphoneText = microphoneText
+        self.combinedText = combinedText
+        self.transcriptionDuration = transcriptionDuration
+        self.modelUsed = modelUsed
+        self.timestampedTranscription = timestampedTranscription
+    }
 }

 enum TranscriptionError: LocalizedError {
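
Defaulting timestampedTranscription to nil keeps the change source-compatible: existing five-argument call sites compile unchanged and simply read back nil. A sketch with illustrative values (the model name and strings are made up):

let legacy = TranscriptionResult(
    systemAudioText: "System audio transcript",
    microphoneText: nil,
    combinedText: "System audio transcript",
    transcriptionDuration: 1.5,
    modelUsed: "base"
)
// legacy.timestampedTranscription == nil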
