Skip to content

Commit c0b753f

Browse files
authored
KnowPro.NET updates (#1762)
* Knowledge Batch extraction * Auto-knowledge extraction * Transcript parsing for Podcasts * Import + index podcast end to end from text file * Language Search: testing end to end * Refactoring * Bug fixes
1 parent 0c2e1c9 commit c0b753f

24 files changed

+655
-91
lines changed

dotnet/typeagent/examples/examplesLib/KnowProWriter.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,15 +247,15 @@ public static async Task WriteScoredSemanticRefsAsync(
247247

248248
if (kType == KnowledgeType.Entity)
249249
{
250-
IList<Scored<ConcreteEntity>> entities = await semanticRefCollection.GetDistinctEntitiesAsync(matchesToDisplay);
251-
for (int i = 0; i < entities.Count; ++i)
250+
IList<Scored<ConcreteEntity>> distinctEntities = await semanticRefCollection.GetDistinctEntitiesAsync(matchesToDisplay);
251+
for (int i = 0; i < distinctEntities.Count; ++i)
252252
{
253253
var pos = isAsc ? matchesToDisplay.Count - (i + 1) : i;
254254
WriteLine(
255255
ConsoleColor.Green,
256-
$"{pos + 1} / {matchesToDisplay.Count}: [{entities[i].Score}]"
256+
$"{pos + 1} / {distinctEntities.Count}: [{distinctEntities[i].Score}]"
257257
);
258-
WriteEntity(entities[i]);
258+
WriteEntity(distinctEntities[i]);
259259
WriteLine();
260260
}
261261
}

dotnet/typeagent/examples/knowProConsole/ConversationEventHandler.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ public void Subscribe(IConversation conversation)
1919
var secondaryIndexes = conversation.SecondaryIndexes;
2020
secondaryIndexes.TermToRelatedTermsIndex.FuzzyIndex.OnIndexed += this.FuzzyIndex_OnIndexed;
2121
secondaryIndexes.MessageIndex.OnIndexed += this.Message_OnIndexed;
22+
conversation.SemanticRefs.OnKnowledgeExtracted += this.KnowledgeExtractor_OnExtracted;
2223
}
2324

2425
/// <summary>
/// Detaches this handler's progress callbacks from the conversation's fuzzy term index,
/// message index and semantic-ref collection.
/// </summary>
/// <param name="conversation">Conversation whose events should no longer be observed.</param>
public void Unsubscribe(IConversation conversation)
{
    var indexes = conversation.SecondaryIndexes;

    indexes.TermToRelatedTermsIndex.FuzzyIndex.OnIndexed -= FuzzyIndex_OnIndexed;
    indexes.MessageIndex.OnIndexed -= Message_OnIndexed;
    conversation.SemanticRefs.OnKnowledgeExtracted -= KnowledgeExtractor_OnExtracted;
}
3032

3133
private void FuzzyIndex_OnIndexed(BatchProgress item)
@@ -38,6 +40,11 @@ private void Message_OnIndexed(BatchProgress item)
3840
WriteProgress(item, "Message");
3941
}
4042

43+
/// <summary>
/// Reports knowledge-extraction batch progress under the "Knowledge" label.
/// </summary>
private void KnowledgeExtractor_OnExtracted(BatchProgress item)
    => WriteProgress(item, "Knowledge");
47+
4148
private void WriteProgress(BatchProgress progress, string label)
4249
{
4350
_inplaceUpdate.Write($"[{label}: {progress.CountCompleted} / {progress.Count}]");

dotnet/typeagent/examples/knowProConsole/PodcastCommands.cs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ private Command PodcastImportIndexDef()
7575
Command cmd = new("kpPodcastImportIndex", "Import existing podcast memory index")
7676
{
7777
Args.Arg<string>("filePath", "Path to existing podcast index"),
78+
Options.Arg<string>("startAt", "ISO date: When the podcast occurred"),
79+
Options.Arg<int>("length", "In minutes"),
80+
Options.Arg<bool>("buildIndex", "Also build index", true)
7881
};
7982
cmd.SetAction(this.PodcastImportIndexAsync);
8083
return cmd;
@@ -84,6 +87,46 @@ private async Task PodcastImportIndexAsync(ParseResult args, CancellationToken c
8487
{
    // Dispatches the import by file type: a ".json" file is loaded as an existing
    // podcast index, anything else is imported as a transcript text file.
    NamedArgs namedArgs = new(args);
    string? filePath = namedArgs.Get("filePath");
    if (string.IsNullOrEmpty(filePath))
    {
        return;
    }

    string podcastName = Path.GetFileNameWithoutExtension(filePath);

    // Path.GetExtension returns the extension WITH its leading period (".json"),
    // so the comparison has to include the dot; comparing against "json" never matches
    // and would send every index file down the transcript path.
    bool isExistingIndex = Path.GetExtension(filePath).Equals(".json", StringComparison.OrdinalIgnoreCase);
    if (isExistingIndex)
    {
        await ImportExistingIndexAsync(namedArgs, filePath, podcastName, cancellationToken);
    }
    else
    {
        await ImportTranscriptAsync(namedArgs, filePath, podcastName, cancellationToken);
    }
}
105+
106+
/// <summary>
/// Creates a new podcast named <paramref name="podcastName"/>, makes it the current podcast,
/// imports the transcript at <paramref name="filePath"/> into it and, when the "buildIndex"
/// option is set, updates its index.
/// </summary>
/// <param name="namedArgs">Parsed command arguments; reads "startAt", "length" and "buildIndex".</param>
/// <param name="filePath">Path of the transcript file to import.</param>
/// <param name="podcastName">Name given to the newly created podcast.</param>
/// <param name="cancellationToken">Token passed to the index update.</param>
private async Task ImportTranscriptAsync(NamedArgs namedArgs, string filePath, string podcastName, CancellationToken cancellationToken)
{
    // Read and validate the optional arguments BEFORE touching any state, so that a malformed
    // "startAt" value fails without having already unloaded the current podcast.
    string? startAt = namedArgs.Get<string>("startAt");
    // The option is documented as an ISO date, so parse it independently of the machine's culture.
    DateTimeOffset? startDate = !string.IsNullOrEmpty(startAt)
        ? DateTimeOffset.Parse(startAt, System.Globalization.CultureInfo.InvariantCulture)
        : null;

    // A non-positive length would produce an empty timestamp range; pass null instead so the
    // podcast's own default length applies.
    // NOTE(review): presumably Get<int> yields 0 when "length" is omitted — confirm against NamedArgs.
    int length = namedArgs.Get<int>("length");
    int? lengthMinutes = length > 0 ? length : null;

    UnloadCurrent();
    Podcast podcast = CreatePodcast(podcastName, true);
    SetCurrent(podcast);

    await podcast.ImportTranscriptAsync(
        filePath,
        podcastName,
        startDate,
        lengthMinutes
    );

    if (namedArgs.Get<bool>("buildIndex"))
    {
        await podcast.UpdateIndexAsync(cancellationToken);
    }
}
127+
128+
private async Task ImportExistingIndexAsync(NamedArgs namedArgs, string filePath, string podcastName, CancellationToken cancellationToken)
129+
{
87130
var data = ConversationJsonSerializer.ReadFromFile<PodcastMessage>(filePath!);
88131
if (data is null)
89132
{
@@ -94,7 +137,6 @@ private async Task PodcastImportIndexAsync(ParseResult args, CancellationToken c
94137

95138
UnloadCurrent();
96139

97-
string? podcastName = Path.GetFileNameWithoutExtension(filePath) ?? throw new NotSupportedException();
98140
KnowProWriter.WriteLine(ConsoleColor.Cyan, $"Importing {podcastName}");
99141
var podcast = CreatePodcast(podcastName, true);
100142
try
@@ -129,6 +171,7 @@ private async Task PodcastImportIndexAsync(ParseResult args, CancellationToken c
129171
}
130172
}
131173

174+
132175
private Podcast CreatePodcast(string name, bool createNew)
133176
{
134177
MemorySettings settings = new MemorySettings();

dotnet/typeagent/src/common/Batch.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ namespace TypeAgent.Common;
55

66
public readonly struct BatchProgress
77
{
8+
/// <summary>
/// Creates a progress value of one completed item out of a total of one.
/// </summary>
public BatchProgress() : this(countCompleted: 1, count: 1) { }
12+
813
public BatchProgress(int countCompleted, int count)
914
{
1015
CountCompleted = countCompleted;

dotnet/typeagent/src/common/StringExtensions.cs

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
namespace TypeAgent.Common;
88

9-
public static class StringExtensions
9+
public static partial class StringExtensions
1010
{
1111
/// <summary>
1212
/// Splits an enumerable of strings into chunks, each chunk containing up to maxChunkLength strings and
@@ -59,4 +59,45 @@ public static List<string> LowerAndSort(this List<string> list)
5959
return list;
6060
}
6161

62+
// Matches a single line break: "\n", optionally preceded by "\r".
// RegexOptions.Compiled is intentionally not passed: the regex source generator ignores it.
[GeneratedRegex(@"\r?\n")]
private static partial Regex LineSplitRegex();

// Cached line-break regex used by SplitLines.
private static readonly Regex s_lineSplitter = LineSplitRegex();
66+
67+
/// <summary>
/// Splits <paramref name="text"/> into lines at "\n" or "\r\n" line breaks.
/// </summary>
/// <param name="text">Text to split.</param>
/// <param name="options">Trimming and empty-entry handling applied to the resulting lines.</param>
/// <returns>The lines of <paramref name="text"/>.</returns>
public static IList<string> SplitLines(this string text, StringSplitOptions options = default)
{
    return Split(text, s_lineSplitter, options);
}
69+
70+
/// <summary>
/// Splits <paramref name="text"/> wherever <paramref name="regex"/> matches, then applies
/// <paramref name="options"/> to the resulting parts.
/// </summary>
/// <param name="text">Text to split.</param>
/// <param name="regex">Pattern that marks the split points.</param>
/// <param name="options">
/// <see cref="StringSplitOptions.TrimEntries"/> trims every part;
/// <see cref="StringSplitOptions.RemoveEmptyEntries"/> drops parts that are empty (after trimming, if both are set).
/// </param>
/// <returns>The parts, in order of appearance.</returns>
public static IList<string> Split(this string text, Regex regex, StringSplitOptions options = default)
{
    ArgumentVerify.ThrowIfNull(regex, nameof(regex));

    string[] pieces = regex.Split(text);

    // Trimming happens first so that whitespace-only parts count as empty below.
    if (options.HasFlag(StringSplitOptions.TrimEntries))
    {
        for (int index = 0; index < pieces.Length; ++index)
        {
            pieces[index] = pieces[index].Trim();
        }
    }

    if (!options.HasFlag(StringSplitOptions.RemoveEmptyEntries))
    {
        return pieces;
    }

    List<string> nonEmpty = new List<string>(pieces.Length);
    foreach (string piece in pieces)
    {
        if (!string.IsNullOrEmpty(piece))
        {
            nonEmpty.Add(piece);
        }
    }
    return nonEmpty;
}
62103
}

dotnet/typeagent/src/conversationMemory/Memory.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ namespace TypeAgent.ConversationMemory;
66
public class Memory<TMessage> : Conversation<TMessage>, IMemory
77
where TMessage : class, IMessage, new()
88
{
9-
109
public Memory(MemorySettings settings, IStorageProvider<TMessage> storageProvider)
1110
: base(settings.ConversationSettings, storageProvider)
1211
{
@@ -42,7 +41,6 @@ public async ValueTask<IList<ConversationSearchResult>> SearchAsync(
4241
IConversation conversation = this;
4342
return await conversation.SearchAsync(
4443
searchText,
45-
Settings.QueryTranslator,
4644
options,
4745
filter,
4846
debugContext,

dotnet/typeagent/src/conversationMemory/MemorySettings.cs

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,39 +12,29 @@ public MemorySettings()
1212
{
1313
}
1414

15+
/// <summary>
/// Creates memory settings from a language model and an embedding model by wrapping
/// them in a new <see cref="ConversationSettings"/>.
/// </summary>
/// <param name="languageModel">Chat model handed to the conversation settings.</param>
/// <param name="embeddingModel">Embedding model handed to the conversation settings.</param>
public MemorySettings(IChatModel languageModel, ITextEmbeddingModel embeddingModel)
    : this(new ConversationSettings(languageModel, embeddingModel)) { }
19+
1520
public MemorySettings(
1621
ConversationSettings conversationSettings,
1722
int embeddingCacheSize = 64,
18-
IChatModel? chatModel = null,
19-
ITextEmbeddingModel? embeddingModel = null,
2023
NoiseText? noiseTerms = null
2124
)
2225
{
2326
ArgumentVerify.ThrowIfNull(conversationSettings, nameof(conversationSettings));
2427

2528
ConversationSettings = conversationSettings;
26-
ChatModel = chatModel ?? new OpenAIChatModel();
27-
EmbeddingModel = new TextEmbeddingModelWithCache(embeddingCacheSize);
28-
QueryTranslator = new SearchQueryTranslator(ChatModel);
29-
KnowledgeExtractor = new KnowledgeExtractor(ChatModel);
30-
3129
NoiseTerms = noiseTerms ?? new NoiseText(
3230
typeof(MemorySettings).Assembly,
3331
"TypeAgent.ConversationMemory.noiseTerms.txt"
3432
);
3533

3634
}
3735

38-
public IChatModel ChatModel { get; }
39-
40-
public TextEmbeddingModelWithCache EmbeddingModel { get; }
41-
4236
public ConversationSettings ConversationSettings { get; }
4337

44-
public ISearchQueryTranslator? QueryTranslator { get; set; }
45-
46-
public IKnowledgeExtractor KnowledgeExtractor { get; set; }
47-
4838
public NoiseText? NoiseTerms { get; set; }
4939

5040
// Setting this to true leverages Structured Tags
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
namespace TypeAgent.ConversationMemory;
5+
6+
/// <summary>
/// Extension methods that operate on lists of conversation messages.
/// </summary>
public static class MessageExtensions
{
    /// <summary>
    /// Assigns timestamps to each message between startDate and endDate proportionally to the number
    /// of characters in each message's text chunks. The first message gets startDate; subsequent
    /// timestamps advance assuming a constant speaking rate.
    /// </summary>
    /// <remarks>
    /// Timestamps are written in UTC. If the messages contain no characters at all (including an
    /// empty list), the method returns without assigning any timestamp.
    /// </remarks>
    /// <param name="messages">Ordered transcript messages.</param>
    /// <param name="startDate">Inclusive start of the time range.</param>
    /// <param name="endDate">Exclusive end of the time range (must be greater than startDate).</param>
    /// <exception cref="ArgumentNullException">messages is null.</exception>
    /// <exception cref="ArgumentException">endDate is not greater than startDate.</exception>
    public static void TimestampMessages<TMessage>(this IList<TMessage> messages, DateTimeOffset startDate, DateTimeOffset endDate)
        where TMessage : IMessage
    {
        // Honor the documented contract: a null list must fail with ArgumentNullException,
        // not with a NullReferenceException further down.
        ArgumentNullException.ThrowIfNull(messages);

        TimeSpan span = endDate - startDate;
        if (span <= TimeSpan.Zero)
        {
            throw new ArgumentException($"{startDate:o} is not < {endDate:o}", nameof(endDate));
        }

        int messageCount = messages.Count;
        var lengths = new int[messageCount];
        // Accumulate in a long so that very large transcripts cannot overflow the total.
        long totalChars = 0;
        for (int i = 0; i < messageCount; ++i)
        {
            int len = messages[i].GetCharCount();
            lengths[i] = len;
            totalChars += len;
        }

        if (totalChars == 0)
        {
            // Nothing to apportion the time range over.
            return;
        }

        double ticksPerChar = (double)span.Ticks / totalChars;
        double elapsedTicks = 0.0;

        for (int i = 0; i < messageCount; ++i)
        {
            // Each message starts where the text of all previous messages ends.
            var dt = startDate.AddTicks((long)elapsedTicks).ToUniversalTime();
            messages[i].Timestamp = dt.ToISOString();
            elapsedTicks += ticksPerChar * lengths[i];
        }
    }
}

dotnet/typeagent/src/conversationMemory/Podcast.cs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,47 @@ public Podcast(MemorySettings settings, IStorageProvider<PodcastMessage> provide
99
: base(settings, provider)
1010
{
1111
}
12+
13+
/// <summary>
/// Reads a transcript text file, parses it into podcast messages, assigns listeners and
/// (optionally) timestamps, and appends the messages to this podcast.
/// </summary>
/// <param name="filePath">Path of the transcript text file.</param>
/// <param name="name">Not used by this method at present.</param>
/// <param name="startDate">When set, messages are timestamped starting at this date.</param>
/// <param name="lengthMinutes">
/// Length of the podcast in minutes, used to spread timestamps when <paramref name="startDate"/>
/// is set; 60 is used when null.
/// </param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="startDate"/> is set and <paramref name="lengthMinutes"/> is zero or negative.
/// </exception>
public async ValueTask ImportTranscriptAsync(
    string filePath,
    string? name = null,
    DateTimeOffset? startDate = null,
    int? lengthMinutes = null
)
{
    int minutes = lengthMinutes ?? 60;
    // Only relevant when timestamps are requested. Fail with a message about the actual culprit
    // instead of letting the timestamping step complain about an empty date range.
    if (startDate is not null && minutes <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(lengthMinutes), minutes, "Length must be greater than zero minutes.");
    }

    // Path validation and I/O error reporting are delegated to File.ReadAllTextAsync.
    string text = await File.ReadAllTextAsync(filePath).ConfigureAwait(false);
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    var (messages, participants) = PodcastMessage.ParseTranscript(text);
    AssignMessageListeners(messages, participants);
    if (startDate is not null)
    {
        messages.TimestampMessages(startDate.Value, startDate.Value.AddMinutes(minutes));
    }

    await Messages.AppendAsync(
        messages
    ).ConfigureAwait(false);
}
37+
38+
/// <summary>
/// For every message that names a speaker, adds each participant other than that speaker
/// to the message's listeners. Messages without a speaker are left untouched.
/// </summary>
/// <param name="messages">Messages whose listener lists are filled in.</param>
/// <param name="participants">All participants found in the transcript.</param>
private void AssignMessageListeners(IList<PodcastMessage> messages, ISet<string> participants)
{
    foreach (PodcastMessage message in messages)
    {
        string? speaker = message.Metadata?.Speaker;
        if (string.IsNullOrEmpty(speaker))
        {
            continue;
        }

        foreach (string participant in participants)
        {
            if (participant == speaker)
            {
                // The speaker does not listen to themselves.
                continue;
            }
            message.Metadata.Listeners.Add(participant);
        }
    }
}
1255
}

0 commit comments

Comments
 (0)