From 4d13e38136f75085d49015d99265d2571fa5a382 Mon Sep 17 00:00:00 2001 From: venkateshwaransf5013 Date: Fri, 5 Jun 2026 12:39:08 +0530 Subject: [PATCH 1/3] Added the Customize Image saving and configure OCR Processing settings content in Data Extraction UG --- .../NET/conversions/pdf-to-markdown.md | 99 +++++++++++ .../NET/working-with-data-extraction.md | 154 +++++++++++------- .../NET/working-with-form-recognition.md | 5 + 3 files changed, 196 insertions(+), 62 deletions(-) diff --git a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md index 2156093d70..a406b279e1 100644 --- a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md +++ b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md @@ -169,6 +169,105 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} +## Customize Image saving using Event + +The ImageNodeVisited event in the Syncfusion® Smart Data Extractor allows users to customize how images are saved during data extraction. With this event, you can: + +* Save images externally (e.g., to disk or cloud storage). +* Replace Base64 content with a file path for optimized storage. +* Control image naming and storage location to meet application requirements. + +### Extract Markdown with external Image saving + +The following code shows how to use the [ExtractDataAsMarkdown](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsMarkdown_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class with the ImageNodeVisited event to customize image saving while exporting content as Markdown. 
+ +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using Syncfusion.Office.Markdown; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF or Image file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Hook the event to customize image handling. + extractor.SaveOptions.ImageNodeVisited += SaveImage; + //Extract Markdown content as string. + string data = extractor.ExtractDataAsMarkdown(inputStream); + //Save the extracted Markdown data into an output file. + File.WriteAllText("DataToMarkdown.md", data); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using Syncfusion.Office.Markdown; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF or Image file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Hook the event to customize image handling. + extractor.SaveOptions.ImageNodeVisited += SaveImage; + //Extract Markdown content as string. + string data = extractor.ExtractDataAsMarkdown(inputStream); + //Save the extracted Markdown data into an output file. + File.WriteAllText("DataToMarkdown.md", data); +} + +{% endhighlight %} + +{% endtabs %} + +The following code shows how to implement the event handler to customize the image path and save images externally. 
+ +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +//Event handler to save images externally +static void SaveImage(object sender, MdImageNodeVisitedEventArgs args) +{ + //Define output image path (customize naming logic as needed) + string imagePath = @"D:\Temp\Image1.png"; + //Save the image stream to file + using (FileStream fileStreamOutput = File.Create(imagePath)) + { + args.ImageStream.CopyTo(fileStreamOutput); + } + //Set the URI to be used in the Markdown output + args.Uri = imagePath; +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +//Event handler to save images externally +static void SaveImage(object sender, MdImageNodeVisitedEventArgs args) +{ + //Define output image path (customize naming logic as needed) + string imagePath = @"D:\Temp\Image1.png"; + //Save the image stream to file + using (FileStream fileStreamOutput = File.Create(imagePath)) + { + args.ImageStream.CopyTo(fileStreamOutput); + } + //Set the URI to be used in the Markdown output + args.Uri = imagePath; +} + +{% endhighlight %} + +{% endtabs %} + + ## PDF to Markdown Preservation Mapping This section explains how common PDF elements are converted and preserved in Markdown format, ensuring that document structure and formatting remain consistent during the PDF to Markdown conversion process. diff --git a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md index ccf5c36ed3..6e1a3ba4d1 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md @@ -340,14 +340,11 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableFormDetection = false; //Extract form data and return as a loaded json file. 
- PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } - {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} @@ -364,11 +361,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableFormDetection = false; //Extract form data and return as a loaded json file. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -397,14 +392,11 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableTableDetection = false; // Extract data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } - {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} @@ -421,11 +413,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableTableDetection = false; // Extract data and return as a loaded json file. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. 
- pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -470,11 +460,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Assign the configured form recognition options to the extractor. extractor.FormRecognizeOptions = formOptions; //Extract form data and return as a loaded json file. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -509,11 +497,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Assign the configured form recognition options to the extractor. extractor.FormRecognizeOptions = formOptions; //Extract form data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -552,11 +538,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Assign the table extraction options to the extractor. extractor.TableExtractionOptions = tableOptions; // Extract data and return as a loaded json file. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. 
- pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -585,11 +569,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Assign the table extraction options to the extractor. extractor.TableExtractionOptions = tableOptions; // Extract data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -619,11 +601,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //default confidence threshold value is 0.6 extractor.ConfidenceThreshold = 0.75; // Extract data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -643,11 +623,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //default confidence threshold value is 0.6 extractor.ConfidenceThreshold = 0.75; // Extract data and return as a loaded json file. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -675,11 +653,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Set the page range for extraction (pages 1 to 3). extractor.PageRange = new int[,] { { 1, 3 } }; //Extract data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new PDF file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -697,11 +673,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Set the page range for extraction (pages 1 to 3). extractor.PageRange = new int[,] { { 1, 3 } }; //Extract data and return as a loaded json document. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -711,5 +685,61 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-within-specific-range/.NET). 
+## Configure OCR Processing Settings + +To configure OCR settings in .NET using the OCRProcessor property of the [DataExtractor]( https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class use the following C# example to set a page range and extract structured data from a PDF document as JSON output with the [ExtractDataAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsJson_System_IO_Stream_) method. + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using Syncfusion.Pdf.Parsing; +using Syncfusion.OCRProcessor; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Set OCR language. + extractor.OCRProcessor.Settings.Language = Languages.English; + //Set tesseract OCR Engine. + extractor.OCRProcessor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Extract data and return as a loaded json document. + PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); + //Save the extracted output as a new PDF file. + pdf.Save("Output.pdf"); + //Close the document. + pdf.Close(true); +} +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using Syncfusion.Pdf.Parsing; +using Syncfusion.OCRProcessor; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Set OCR language. + extractor.OCRProcessor.Settings.Language = Languages.English; + //Set tesseract OCR Engine. 
+ extractor.OCRProcessor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Extract data and return as a loaded json document. + PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); + //Save the extracted output as a new PDF file. + pdf.Save("Output.pdf"); + //Close the document. + pdf.Close(true); +} + +{% endhighlight %} + +{% endtabs %} diff --git a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md index 4e76f53b22..26c8203673 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md +++ b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md @@ -8,6 +8,11 @@ documentation: UG # Working with Form Recognition +The Syncfusion® Smart Form Recognizer is a .NET library that detects and extracts form fields such as text inputs, checkboxes, and radio buttons along with their types and values. + +To quickly get started with recognizing form data from PDF and image files using the Smart Form Recognizer library, refer to this video tutorial: +{% youtube "https://www.youtube.com/watch?v=1F1jRW3JIB4" %} + ## Recognize Forms as JSON To recognize form data from a PDF or image and get the output as a JSON string using the [RecognizeFormAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html#Syncfusion_SmartFormRecognizer_FormRecognizer_RecognizeFormAsJson_System_IO_Stream_) (synchronous) and [RecognizeFormAsJsonAsync](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html#Syncfusion_SmartFormRecognizer_FormRecognizer_RecognizeFormAsJsonAsync_System_IO_Stream_System_Threading_CancellationToken_) (asynchronous) methods of the [FormRecognizer](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html) class, refer to the following code examples. 
From 21024639877fc2c65984d15b0dc6c7ac904d214b Mon Sep 17 00:00:00 2001 From: venkateshwaransf5013 Date: Fri, 5 Jun 2026 16:59:31 +0530 Subject: [PATCH 2/3] Addressed the review feedbacks --- .../NET/conversions/pdf-to-markdown.md | 11 ++++---- .../NET/working-with-data-extraction.md | 27 ++++++++++++------- .../NET/working-with-form-recognition.md | 3 +-- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md index a406b279e1..59849d4a32 100644 --- a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md +++ b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md @@ -169,17 +169,16 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} -## Customize Image saving using Event +## Customize image saving -The ImageNodeVisited event in the Syncfusion® Smart Data Extractor allows users to customize how images are saved during data extraction. With this event, you can: +The [ImageNodeVisited](https://help.syncfusion.com/cr/document-processing/Syncfusion.DocIO.DLS.SaveOptions.html#Syncfusion_DocIO_DLS_SaveOptions_ImageNodeVisited) event in the Syncfusion® Smart Data Extractor allows users to customize how images are saved during data extraction. With this event, you can: -* Save images externally (e.g., to disk or cloud storage). +* Customize image names and storage paths, and save images externally. * Replace Base64 content with a file path for optimized storage. -* Control image naming and storage location to meet application requirements. 
-### Extract Markdown with external Image saving +### Extract Markdown with external image saving -The following code shows how to use the [ExtractDataAsMarkdown](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsMarkdown_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class with the ImageNodeVisited event to customize image saving while exporting content as Markdown. +The following code shows how to use the [ExtractDataAsMarkdown](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsMarkdown_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class with the [ImageNodeVisited](https://help.syncfusion.com/cr/document-processing/Syncfusion.DocIO.DLS.SaveOptions.html#Syncfusion_DocIO_DLS_SaveOptions_ImageNodeVisited) event to customize image saving while exporting content as Markdown. 
{% tabs %} diff --git a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md index 6e1a3ba4d1..d27fb3b86b 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md @@ -687,7 +687,8 @@ You can download a complete working sample from [GitHub](https://github.com/Sync ## Configure OCR Processing Settings -To configure OCR settings in .NET using the OCRProcessor property of the [DataExtractor]( https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class use the following C# example to set a page range and extract structured data from a PDF document as JSON output with the [ExtractDataAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsJson_System_IO_Stream_) method. +To configure OCR settings in .NET, assign a configured OCR processor to the **OCRProcessor** property of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class. The following C# example initializes the OCR processor, sets the language and Tesseract version, and extracts structured data from a PDF document with the [ExtractDataAsPdfDocument](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsPdfDocument_System_IO_Stream_) method. + {% tabs %} @@ -702,11 +703,15 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess { //Initialize the Data Extractor. DataExtractor extractor = new DataExtractor(); + //Initialize the OCR processor. + OCRProcessor processor = new OCRProcessor(); //Set OCR language. 
- extractor.OCRProcessor.Settings.Language = Languages.English; - //Set tesseract OCR Engine. - extractor.OCRProcessor.Settings.TesseractVersion = TesseractVersion.Version5_0; - //Extract data and return as a loaded json document. + processor.Settings.Language = Languages.English; + //Set Tesseract OCR engine version. + processor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Assign the configured OCR processor to the Data Extractor. + extractor.OCRProcessor = processor; + //Extract data and return as a loaded PDF document. PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); //Save the extracted output as a new PDF file. pdf.Save("Output.pdf"); @@ -727,11 +732,15 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess { //Initialize the Data Extractor. DataExtractor extractor = new DataExtractor(); + //Initialize the OCR processor. + OCRProcessor processor = new OCRProcessor(); //Set OCR language. - extractor.OCRProcessor.Settings.Language = Languages.English; - //Set tesseract OCR Engine. - extractor.OCRProcessor.Settings.TesseractVersion = TesseractVersion.Version5_0; - //Extract data and return as a loaded json document. + processor.Settings.Language = Languages.English; + //Set Tesseract OCR engine version. + processor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Assign the configured OCR processor to the Data Extractor. + extractor.OCRProcessor = processor; + //Extract data and return as a loaded PDF document. PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); //Save the extracted output as a new PDF file. 
pdf.Save("Output.pdf"); diff --git a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md index 26c8203673..a68a71e331 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md +++ b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md @@ -8,8 +8,7 @@ documentation: UG # Working with Form Recognition -The Syncfusion® Smart Form Recognizer is a .NET library that detects and extracts form fields such as text inputs, checkboxes, and radio buttons along with their types and values. - +The Syncfusion® Smart Form Recognizer is a C# library for .NET that reliably extracts form data from PDFs and scanned images. It detects text fields, checkboxes, radio buttons, and signature regions. To quickly get started with recognizing form data from PDF and image files using the Smart Form Recognizer library, refer to this video tutorial: {% youtube "https://www.youtube.com/watch?v=1F1jRW3JIB4" %} From c87565f37302650329519b67654c85f6c788886f Mon Sep 17 00:00:00 2001 From: venkateshwaransf5013 Date: Fri, 5 Jun 2026 19:50:12 +0530 Subject: [PATCH 3/3] Added the markdowndocument content --- .../NET/working-with-data-extraction.md | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md index d27fb3b86b..4379b8cecc 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md @@ -320,6 +320,107 @@ using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileA You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-stream/.NET). 
+## Convert PDF/Image to Markdown + +The **Smart Data Extractor** enables you to process PDF documents or scanned images and export the structured content as a MarkdownDocument object, the Markdown document object model (MD DOM). +This section covers two scenarios: + +* Extracting from PDF +* Extracting from Image + +### Extracting from PDF + +To extract structured data from a PDF document and save it as a Markdown document using the **ExtractDataAsMarkdownDocument** method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following code example: + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. + markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. 
+ markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% endtabs %} + +### Extracting from Image + +To extract structured data from an image file and save it as a Markdown document using the **ExtractDataAsMarkdownDocument** method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following C# code example. + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input image file as a stream. +using (FileStream stream = new FileStream("Input.png", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. + markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input image file as a stream. +using (FileStream stream = new FileStream("Input.png", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. 
+ markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% endtabs %} + + ## Disable Form Detection To disable form field detection while extracting structured data from a PDF document using the [ExtractDataAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsJson_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following code example: @@ -752,3 +853,5 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} + +