
Commit 8766a24

Ivo Bellin Salarin committed
feat: copy transcript and timestamps
1 parent bef740f

16 files changed: +568 −13 lines changed

Recap.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
@@ -597,8 +597,8 @@
             isa = XCRemoteSwiftPackageReference;
             repositoryURL = "https://github.com/argmaxinc/WhisperKit.git";
             requirement = {
-                branch = main;
-                kind = branch;
+                kind = upToNextMajorVersion;
+                minimumVersion = 0.9.0;
             };
         };
         A743B0892E3D479600785BFF /* XCRemoteSwiftPackageReference "swift-markdown-ui" */ = {
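
This pins WhisperKit to tagged releases instead of tracking the main branch. For reference, the equivalent constraint in Package.swift manifest syntax would look like this (a sketch; the project itself declares the dependency through the Xcode project file, not a manifest):

import PackageDescription

let package = Package(
    name: "Recap",
    dependencies: [
        // from: "0.9.0" resolves as up-to-next-major, i.e. >= 0.9.0 and < 1.0.0
        .package(url: "https://github.com/argmaxinc/WhisperKit.git", from: "0.9.0")
    ]
)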

Recap.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default.

Recap/DataModels/RecapDataModel.xcdatamodeld/RecapDataModel.xcdatamodel/contents

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@
     <attribute name="state" attributeType="Integer 16" defaultValueString="0" usesScalarValueType="YES"/>
     <attribute name="summaryText" optional="YES" attributeType="String"/>
     <attribute name="transcriptionText" optional="YES" attributeType="String"/>
+    <attribute name="timestampedTranscriptionData" optional="YES" attributeType="Binary"/>
     <fetchIndex name="byCreatedAt">
         <fetchIndexElement property="createdAt" type="Binary" order="descending"/>
     </fetchIndex>

Recap/Repositories/Models/RecordingInfo.swift

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,7 @@ struct RecordingInfo: Identifiable, Equatable {
     let applicationName: String?
     let transcriptionText: String?
     let summaryText: String?
+    let timestampedTranscription: TimestampedTranscription?
     let createdAt: Date
     let modifiedAt: Date

@@ -50,6 +51,13 @@ extension RecordingInfo {
         self.applicationName = entity.applicationName
         self.transcriptionText = entity.transcriptionText
         self.summaryText = entity.summaryText
+
+        // Decode timestamped transcription data if available
+        if let data = entity.timestampedTranscriptionData {
+            self.timestampedTranscription = try? JSONDecoder().decode(TimestampedTranscription.self, from: data)
+        } else {
+            self.timestampedTranscription = nil
+        }
         self.createdAt = entity.createdAt ?? Date()
         self.modifiedAt = entity.modifiedAt ?? Date()
     }

Recap/Repositories/Recordings/RecordingRepository.swift

Lines changed: 20 additions & 0 deletions
@@ -139,6 +139,26 @@ final class RecordingRepository: RecordingRepositoryType {
         }
     }

+    func updateRecordingTimestampedTranscription(id: String, timestampedTranscription: TimestampedTranscription) async throws {
+        try await withCheckedThrowingContinuation { continuation in
+            coreDataManager.performBackgroundTask { context in
+                do {
+                    let recording = try self.fetchRecordingEntity(id: id, context: context)
+
+                    // Encode the timestamped transcription to binary data
+                    let data = try JSONEncoder().encode(timestampedTranscription)
+                    recording.timestampedTranscriptionData = data
+                    recording.modifiedAt = Date()
+
+                    try context.save()
+                    continuation.resume()
+                } catch {
+                    continuation.resume(throwing: error)
+                }
+            }
+        }
+    }
+
     func updateRecordingSummary(id: String, summaryText: String) async throws {
         try await withCheckedThrowingContinuation { continuation in
             coreDataManager.performBackgroundTask { context in
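
Together with the decoding in RecordingInfo above, this gives a JSON round trip through the Binary attribute. A minimal standalone sketch of that round trip, using the TranscriptionSegment and TimestampedTranscription types introduced later in this commit (segment values are illustrative):

import Foundation

let segment = TranscriptionSegment(text: "Hello", startTime: 0.0, endTime: 1.2, source: .microphone)
let original = TimestampedTranscription(segments: [segment])

// What the repository writes into timestampedTranscriptionData
let data = try JSONEncoder().encode(original)

// What RecordingInfo reads back out of the entity
let decoded = try JSONDecoder().decode(TimestampedTranscription.self, from: data)
assert(decoded == original) // Codable + Equatable make the round trip checkable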

Recap/Repositories/Recordings/RecordingRepositoryType.swift

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ protocol RecordingRepositoryType {
     func updateRecordingState(id: String, state: RecordingProcessingState, errorMessage: String?) async throws
     func updateRecordingEndDate(id: String, endDate: Date) async throws
     func updateRecordingTranscription(id: String, transcriptionText: String) async throws
+    func updateRecordingTimestampedTranscription(id: String, timestampedTranscription: TimestampedTranscription) async throws
     func updateRecordingSummary(id: String, summaryText: String) async throws
     func updateRecordingURLs(id: String, recordingURL: URL?, microphoneURL: URL?) async throws
     func deleteRecording(id: String) async throws

Recap/Services/Processing/ProcessingCoordinator.swift

Lines changed: 8 additions & 0 deletions
@@ -126,6 +126,14 @@ final class ProcessingCoordinator: ProcessingCoordinatorType {
             transcriptionText: transcriptionResult.combinedText
         )

+        // Save timestamped transcription data if available
+        if let timestampedTranscription = transcriptionResult.timestampedTranscription {
+            try await recordingRepository.updateRecordingTimestampedTranscription(
+                id: recording.id,
+                timestampedTranscription: timestampedTranscription
+            )
+        }
+
         try await updateRecordingState(recording.id, state: .transcribed)

         return transcriptionResult.combinedText
Recap/Services/Transcription/TimestampedTranscription.swift (new file; path inferred from the type name and surrounding files)

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+import Foundation
+
+/// Represents a single segment of transcribed text with timing information
+struct TranscriptionSegment: Equatable, Codable {
+    let text: String
+    let startTime: TimeInterval
+    let endTime: TimeInterval
+    let source: AudioSource
+
+    /// The audio source this segment came from
+    enum AudioSource: String, CaseIterable, Codable {
+        case systemAudio = "system_audio"
+        case microphone = "microphone"
+    }
+
+    /// Duration of this segment
+    var duration: TimeInterval {
+        endTime - startTime
+    }
+
+    /// Check if this segment overlaps with another segment
+    func overlaps(with other: TranscriptionSegment) -> Bool {
+        return startTime < other.endTime && endTime > other.startTime
+    }
+
+    /// Check if this segment occurs before another segment
+    func isBefore(_ other: TranscriptionSegment) -> Bool {
+        return endTime <= other.startTime
+    }
+
+    /// Check if this segment occurs after another segment
+    func isAfter(_ other: TranscriptionSegment) -> Bool {
+        return startTime >= other.endTime
+    }
+}
+
+/// Collection of transcription segments with utility methods for merging and sorting
+struct TimestampedTranscription: Equatable, Codable {
+    let segments: [TranscriptionSegment]
+    let totalDuration: TimeInterval
+
+    init(segments: [TranscriptionSegment]) {
+        self.segments = segments.sorted { $0.startTime < $1.startTime }
+        self.totalDuration = segments.map { $0.endTime }.max() ?? 0
+    }
+
+    /// Get all segments from a specific audio source
+    func segments(from source: TranscriptionSegment.AudioSource) -> [TranscriptionSegment] {
+        return segments.filter { $0.source == source }
+    }
+
+    /// Get segments within a specific time range
+    func segments(in timeRange: ClosedRange<TimeInterval>) -> [TranscriptionSegment] {
+        return segments.filter { segment in
+            segment.startTime <= timeRange.upperBound && segment.endTime >= timeRange.lowerBound
+        }
+    }
+
+    /// Merge with another timestamped transcription, interleaving by time
+    func merged(with other: TimestampedTranscription) -> TimestampedTranscription {
+        let allSegments = segments + other.segments
+        return TimestampedTranscription(segments: allSegments)
+    }
+
+    /// Get a simple text representation (current behavior)
+    var combinedText: String {
+        return segments.map { $0.text }.joined(separator: " ")
+    }
+
+    /// Get a formatted text representation with timestamps
+    var formattedText: String {
+        return segments.map { segment in
+            let startMinutes = Int(segment.startTime) / 60
+            let startSeconds = Int(segment.startTime) % 60
+            let endMinutes = Int(segment.endTime) / 60
+            let endSeconds = Int(segment.endTime) % 60
+
+            return "[\(String(format: "%02d:%02d", startMinutes, startSeconds))-\(String(format: "%02d:%02d", endMinutes, endSeconds))] [\(segment.source.rawValue)] \(segment.text)"
+        }.joined(separator: "\n")
+    }
+
+    /// Get segments grouped by source
+    var segmentsBySource: [TranscriptionSegment.AudioSource: [TranscriptionSegment]] {
+        return Dictionary(grouping: segments) { $0.source }
+    }
+}
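
A short sketch of how this model composes, with illustrative segment values: merged(with:) concatenates two sources and the init re-sorts by start time, so formattedText interleaves them chronologically.

let system = TimestampedTranscription(segments: [
    TranscriptionSegment(text: "Welcome, everyone.", startTime: 0.0, endTime: 2.5, source: .systemAudio)
])
let mic = TimestampedTranscription(segments: [
    TranscriptionSegment(text: "Thanks for having me.", startTime: 2.5, endTime: 4.0, source: .microphone)
])

let merged = system.merged(with: mic)
print(merged.formattedText)
// [00:00-00:02] [system_audio] Welcome, everyone.
// [00:02-00:04] [microphone] Thanks for having me.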

Recap/Services/Transcription/TranscriptionService.swift

Lines changed: 73 additions & 7 deletions
@@ -25,27 +25,37 @@ final class TranscriptionService: TranscriptionServiceType {
             throw TranscriptionError.modelNotAvailable
         }

+        // Get both text and timestamped segments
         let systemAudioText = try await transcribeAudioFile(audioURL, with: whisperKit)
+        let systemAudioSegments = try await transcribeAudioFileWithTimestamps(audioURL, with: whisperKit, source: .systemAudio)

         var microphoneText: String?
+        var microphoneSegments: [TranscriptionSegment] = []
+
         if let microphoneURL = microphoneURL,
            FileManager.default.fileExists(atPath: microphoneURL.path) {
             microphoneText = try await transcribeAudioFile(microphoneURL, with: whisperKit)
+            microphoneSegments = try await transcribeAudioFileWithTimestamps(microphoneURL, with: whisperKit, source: .microphone)
         }

         let combinedText = buildCombinedText(
             systemAudioText: systemAudioText,
             microphoneText: microphoneText
         )

+        // Create timestamped transcription by merging segments
+        let allSegments = systemAudioSegments + microphoneSegments
+        let timestampedTranscription = TimestampedTranscription(segments: allSegments)
+
         let duration = Date().timeIntervalSince(startTime)

         return TranscriptionResult(
             systemAudioText: systemAudioText,
             microphoneText: microphoneText,
             combinedText: combinedText,
             transcriptionDuration: duration,
-            modelUsed: modelName
+            modelUsed: modelName,
+            timestampedTranscription: timestampedTranscription
         )
     }

@@ -67,34 +77,54 @@ final class TranscriptionService: TranscriptionServiceType {

     private func loadModel(_ modelName: String, isDownloaded: Bool) async throws {
         do {
+            print("Loading WhisperKit model: \(modelName), isDownloaded: \(isDownloaded)")
+
+            // Always try to download/load the model, as WhisperKit will handle caching
+            // The isDownloaded flag is just for UI purposes, but WhisperKit manages its own cache
             let newWhisperKit = try await WhisperKit.createWithProgress(
                 model: modelName,
                 modelRepo: "argmaxinc/whisperkit-coreml",
                 modelFolder: nil,
-                download: true,
+                download: true, // Always allow download, WhisperKit will use cache if available
                 progressCallback: { progress in
-                    // todo: notify UI?
                     print("WhisperKit download progress: \(progress.fractionCompleted)")
                 }
             )

+            print("WhisperKit model loaded successfully: \(modelName)")
             self.whisperKit = newWhisperKit
             self.loadedModelName = modelName

+            // Mark as downloaded in our repository if not already marked
             if !isDownloaded {
-                try await whisperModelRepository.markAsDownloaded(name: modelName, sizeInMB: nil)
+                let modelInfo = await WhisperKit.getModelSizeInfo(for: modelName)
+                try await whisperModelRepository.markAsDownloaded(name: modelName, sizeInMB: Int64(modelInfo.totalSizeMB))
+                print("Model marked as downloaded: \(modelName), size: \(modelInfo.totalSizeMB) MB")
             }

         } catch {
-            throw TranscriptionError.modelLoadingFailed(error.localizedDescription)
+            print("Failed to load WhisperKit model \(modelName): \(error)")
+            throw TranscriptionError.modelLoadingFailed("Failed to load model \(modelName): \(error.localizedDescription)")
         }
     }

     private func transcribeAudioFile(_ url: URL, with whisperKit: WhisperKit) async throws -> String {
         do {
-            let transcriptionResults = try await whisperKit.transcribe(audioPath: url.path)
+            let options = DecodingOptions(
+                task: .transcribe,
+                language: nil, // Auto-detect language
+                withoutTimestamps: false, // We want timestamps
+                wordTimestamps: false // We don't need word-level timestamps for basic transcription
+            )
+
+            let results = try await whisperKit.transcribe(audioPath: url.path, decodeOptions: options)
+            let result = results.first

-            let text = transcriptionResults
+            guard let segments = result?.segments else {
+                return ""
+            }
+
+            let text = segments
                 .map { $0.text.trimmingCharacters(in: .whitespacesAndNewlines) }
                 .filter { !$0.isEmpty }
                 .joined(separator: " ")
@@ -106,6 +136,42 @@ final class TranscriptionService: TranscriptionServiceType {
         }
     }

+    private func transcribeAudioFileWithTimestamps(_ url: URL, with whisperKit: WhisperKit, source: TranscriptionSegment.AudioSource) async throws -> [TranscriptionSegment] {
+        do {
+            let options = DecodingOptions(
+                task: .transcribe,
+                language: nil, // Auto-detect language
+                withoutTimestamps: false, // We want timestamps
+                wordTimestamps: true // Enable word timestamps for precise timing
+            )
+
+            let results = try await whisperKit.transcribe(audioPath: url.path, decodeOptions: options)
+            let result = results.first
+
+            guard let segments = result?.segments else {
+                return []
+            }
+
+            // Convert WhisperKit segments to our TranscriptionSegment format
+            let transcriptionSegments = segments.compactMap { segment -> TranscriptionSegment? in
+                let text = segment.text.trimmingCharacters(in: .whitespacesAndNewlines)
+                guard !text.isEmpty else { return nil }
+
+                return TranscriptionSegment(
+                    text: text,
+                    startTime: TimeInterval(segment.start),
+                    endTime: TimeInterval(segment.end),
+                    source: source
+                )
+            }
+
+            return transcriptionSegments
+
+        } catch {
+            throw TranscriptionError.transcriptionFailed(error.localizedDescription)
+        }
+    }
+
     private func buildCombinedText(systemAudioText: String, microphoneText: String?) -> String {
         var combinedText = systemAudioText

Recap/Services/Transcription/TranscriptionServiceType.swift

Lines changed: 19 additions & 0 deletions
@@ -13,6 +13,25 @@ struct TranscriptionResult: Equatable {
     let combinedText: String
     let transcriptionDuration: TimeInterval
     let modelUsed: String
+
+    // New timestamped transcription data
+    let timestampedTranscription: TimestampedTranscription?
+
+    init(
+        systemAudioText: String,
+        microphoneText: String?,
+        combinedText: String,
+        transcriptionDuration: TimeInterval,
+        modelUsed: String,
+        timestampedTranscription: TimestampedTranscription? = nil
+    ) {
+        self.systemAudioText = systemAudioText
+        self.microphoneText = microphoneText
+        self.combinedText = combinedText
+        self.transcriptionDuration = transcriptionDuration
+        self.modelUsed = modelUsed
+        self.timestampedTranscription = timestampedTranscription
+    }
 }

 enum TranscriptionError: LocalizedError {
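
Defaulting timestampedTranscription to nil keeps the change source-compatible: existing five-argument call sites compile unchanged and simply read back nil. A sketch with illustrative values (the model name and strings are made up):

let legacy = TranscriptionResult(
    systemAudioText: "System audio transcript",
    microphoneText: nil,
    combinedText: "System audio transcript",
    transcriptionDuration: 1.5,
    modelUsed: "base"
)
// legacy.timestampedTranscription == nil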
