From 0f7fe62faecf38b4436cd4cca0ef9ac246a073ea Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 20:51:05 +0000 Subject: [PATCH 1/6] Initial plan From ed3d08284e8d58e558241d17102660bee0c3d6ac Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:06:06 +0000 Subject: [PATCH 2/6] Use DataContent from Microsoft.Extensions.AI for data URI creation - Added reference to Microsoft.Extensions.AI.Abstractions - Replaced manual CreateDataUri method with DataContent class - Use MemoryStream.GetBuffer() with Length instead of ToArray() to avoid array allocations - All existing tests pass Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../MarkItDownMcpReader.cs | 24 ++++++++----------- ...Extensions.DataIngestion.MarkItDown.csproj | 1 + 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index b75fc2e7f50..1365448d3ca 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -6,6 +6,7 @@ using System.IO; using System.Threading; using System.Threading.Tasks; +using Microsoft.Extensions.AI; using Microsoft.Shared.Diagnostics; using ModelContextProtocol.Client; using ModelContextProtocol.Protocol; @@ -42,7 +43,7 @@ public override async Task ReadAsync(FileInfo source, string throw new FileNotFoundException("The specified file does not exist.", source.FullName); } - // Read file content as base64 data URI + // Read file content and create data URI using DataContent #if NET byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); #else @@ -54,7 +55,9 @@ 
public override async Task<IngestionDocument> ReadAsync(FileInfo source, string fileBytes = ms.ToArray(); } #endif - string dataUri = CreateDataUri(fileBytes, mediaType); + string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!; + DataContent dataContent = new(fileBytes, mimeType); + string dataUri = dataContent.Uri; string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); @@ -67,30 +70,23 @@ public override async Task<IngestionDocument> ReadAsync(Stream source, string id _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); - // Read stream content as base64 data URI + // Read stream content and create data URI using DataContent using MemoryStream ms = new(); #if NET await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); #else await source.CopyToAsync(ms).ConfigureAwait(false); #endif - byte[] fileBytes = ms.ToArray(); - string dataUri = CreateDataUri(fileBytes, mediaType); + ReadOnlyMemory<byte> data = new(ms.GetBuffer(), 0, (int)ms.Length); + string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType; + DataContent dataContent = new(data, mimeType); + string dataUri = dataContent.Uri; string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } -#pragma warning disable S3995 // URI return values should not be strings - private static string CreateDataUri(byte[] fileBytes, string? mediaType) -#pragma warning restore S3995 // URI return values should not be strings - { - string base64Content = Convert.ToBase64String(fileBytes); - string mimeType = string.IsNullOrEmpty(mediaType) ? 
"application/octet-stream" : mediaType!; - return $"data:{mimeType};base64,{base64Content}"; - } - private async Task ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken) { // Create HTTP client transport for MCP diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj index 4699db509a8..ee229c118da 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj @@ -17,6 +17,7 @@ + From 90ee687bbe0511a56b38891e16182f05cd7e11f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 22:06:02 +0000 Subject: [PATCH 3/6] Address PR feedback: remove redundant reference and simplify code - Remove explicit Microsoft.Extensions.AI.Abstractions reference (available transitively) - Change fileBytes from byte[] to ReadOnlyMemory to avoid ToArray() - Simplify Stream reading code with inline DataContent construction Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../MarkItDownMcpReader.cs | 13 ++++++------- ...osoft.Extensions.DataIngestion.MarkItDown.csproj | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index 1365448d3ca..981373406f9 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -45,14 +45,14 @@ public override async Task ReadAsync(FileInfo source, string // Read file 
content and create data URI using DataContent #if NET - byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); + ReadOnlyMemory<byte> fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); #else - byte[] fileBytes; + ReadOnlyMemory<byte> fileBytes; using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous)) { using MemoryStream ms = new(); await fs.CopyToAsync(ms).ConfigureAwait(false); - fileBytes = ms.ToArray(); + fileBytes = ms.GetBuffer().AsMemory(0, (int)ms.Length); } #endif string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!; @@ -77,10 +77,9 @@ public override async Task<IngestionDocument> ReadAsync(Stream source, string id #else await source.CopyToAsync(ms).ConfigureAwait(false); #endif - ReadOnlyMemory<byte> data = new(ms.GetBuffer(), 0, (int)ms.Length); - string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType; - DataContent dataContent = new(data, mimeType); - string dataUri = dataContent.Uri; + string dataUri = new DataContent( + ms.GetBuffer().AsMemory(0, (int)ms.Length), + string.IsNullOrEmpty(mediaType) ? 
"application/octet-stream" : mediaType).Uri; string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj index ee229c118da..4699db509a8 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj @@ -17,7 +17,6 @@ - From c8a2cfeba85afb11890135a60e04d43554f4fe26 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 22:19:42 +0000 Subject: [PATCH 4/6] Optimize MemoryStream allocation with pre-sizing - Seed MemoryStream with FileStream.Length for FileInfo overload - Seed MemoryStream with source.Length when source.CanSeek for Stream overload - Consolidate DataContent construction into single statement for FileInfo overload Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../MarkItDownMcpReader.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index 981373406f9..6a61587bbe1 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -50,14 +50,14 @@ public override async Task ReadAsync(FileInfo source, string ReadOnlyMemory fileBytes; using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous)) { - using MemoryStream ms = new(); + using 
MemoryStream ms = new((int)fs.Length); await fs.CopyToAsync(ms).ConfigureAwait(false); fileBytes = ms.GetBuffer().AsMemory(0, (int)ms.Length); } #endif - string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!; - DataContent dataContent = new(fileBytes, mimeType); - string dataUri = dataContent.Uri; + string dataUri = new DataContent( + fileBytes, + string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!).Uri; string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); @@ -71,7 +71,7 @@ public override async Task ReadAsync(Stream source, string id _ = Throw.IfNullOrEmpty(identifier); // Read stream content and create data URI using DataContent - using MemoryStream ms = new(); + using MemoryStream ms = source.CanSeek ? new((int)source.Length) : new(); #if NET await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); #else From c76000f3b4fc1a6fe41c303925770488946d2b98 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 7 Nov 2025 22:33:00 +0000 Subject: [PATCH 5/6] Use Math.Min for safe int casting of stream lengths - Replace (int)fs.Length with (int)Math.Min(int.MaxValue, fs.Length) - Replace (int)source.Length with (int)Math.Min(int.MaxValue, source.Length) - Prevents overflow when dealing with files/streams larger than int.MaxValue Co-authored-by: stephentoub <2642209+stephentoub@users.noreply.github.com> --- .../MarkItDownMcpReader.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index 6a61587bbe1..fbb3b566192 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -50,7 +50,7 @@ 
public override async Task<IngestionDocument> ReadAsync(FileInfo source, string ReadOnlyMemory<byte> fileBytes; using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous)) { - using MemoryStream ms = new((int)fs.Length); + using MemoryStream ms = new((int)Math.Min(int.MaxValue, fs.Length)); await fs.CopyToAsync(ms).ConfigureAwait(false); fileBytes = ms.GetBuffer().AsMemory(0, (int)ms.Length); } @@ -71,7 +71,7 @@ public override async Task<IngestionDocument> ReadAsync(Stream source, string id _ = Throw.IfNullOrEmpty(identifier); // Read stream content and create data URI using DataContent - using MemoryStream ms = source.CanSeek ? new((int)source.Length) : new(); + using MemoryStream ms = source.CanSeek ? new((int)Math.Min(int.MaxValue, source.Length)) : new(); #if NET await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); #else From 31cd994a4ad4d48317e05050c883109930c77665 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 10 Nov 2025 13:34:20 +0000 Subject: [PATCH 6/6] Defer data URI materialization until after MCP client creation - Pass DataContent to ConvertToMarkdownAsync instead of string URI - Materialize dataContent.Uri only after MCP client is successfully created - Avoids unnecessary string allocation if connection fails Co-authored-by: adamsitnik <6011991+adamsitnik@users.noreply.github.com> --- .../MarkItDownMcpReader.cs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs index fbb3b566192..e6a14bfbf17 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs @@ -43,7 +43,7 @@ public override async Task<IngestionDocument> ReadAsync(FileInfo source, string 
throw new FileNotFoundException("The specified file does not exist.", source.FullName); } - // Read file content and create data URI using DataContent + // Read file content and create DataContent #if NET ReadOnlyMemory fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); #else @@ -55,11 +55,11 @@ public override async Task ReadAsync(FileInfo source, string fileBytes = ms.GetBuffer().AsMemory(0, (int)ms.Length); } #endif - string dataUri = new DataContent( + DataContent dataContent = new( fileBytes, - string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!).Uri; + string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!); - string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); + string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } @@ -70,23 +70,23 @@ public override async Task ReadAsync(Stream source, string id _ = Throw.IfNull(source); _ = Throw.IfNullOrEmpty(identifier); - // Read stream content and create data URI using DataContent + // Read stream content and create DataContent using MemoryStream ms = source.CanSeek ? new((int)Math.Min(int.MaxValue, source.Length)) : new(); #if NET await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); #else await source.CopyToAsync(ms).ConfigureAwait(false); #endif - string dataUri = new DataContent( + DataContent dataContent = new( ms.GetBuffer().AsMemory(0, (int)ms.Length), - string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType).Uri; + string.IsNullOrEmpty(mediaType) ? 
"application/octet-stream" : mediaType); - string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); + string markdown = await ConvertToMarkdownAsync(dataContent, cancellationToken).ConfigureAwait(false); return MarkdownParser.Parse(markdown, identifier); } - private async Task ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken) + private async Task ConvertToMarkdownAsync(DataContent dataContent, CancellationToken cancellationToken) { // Create HTTP client transport for MCP HttpClientTransport transport = new(new HttpClientTransportOptions @@ -104,7 +104,7 @@ private async Task ConvertToMarkdownAsync(string dataUri, CancellationTo // Build parameters for convert_to_markdown tool Dictionary parameters = new() { - ["uri"] = dataUri + ["uri"] = dataContent.Uri }; // Call the convert_to_markdown tool