Code samples

Code samples for VintaSoft Imaging .NET SDK. Here you can request a code sample.

Board index < VintaSoft Imaging < VintaSoft Imaging .NET SDK and Plug-ins Discussions < Code samples

We are migrating to a new forum engine, so new registrations and posting are currently unavailable. Thank you in advance for your patience.

Console: Convert an image-only PDF document to a searchable PDF document.



Console: Convert an image-only PDF document to a searchable PDF document.

Post by Alex »

To convert an image-only PDF document to a searchable PDF document, perform the following steps:
  • Render each image-only PDF page at a resolution of 300 dpi or higher.
  • Prepare the rendered image for text recognition if necessary. For example, you can remove noise from the image.
  • Recognize the text on the image.
  • Filter the recognition result if necessary.
  • Add the recognized text to the PDF page together with the page image.

  • Here is a C# example that shows how to convert an image-only PDF document to a searchable PDF document:
    using System;
    using System.Collections.Generic;
    
    namespace ConsoleApplication1
    {
        class Program
        {
            static void Main(string[] args)
            {
                // convert an image-only PDF document to a searchable PDF document
                ConvertImageOnlyPdfToSearchablePdf(
                    "imageOnlyPdfDocument.pdf",
                    Vintasoft.Imaging.Ocr.OcrLanguage.English,
                    "searchablePdfDocument.pdf");
            }
    
            /// <summary>
            /// Converts an image-only PDF document to a searchable PDF document.
            /// </summary>
            /// <param name="imageOnlyPdfFilename">A filename of source image-only PDF file.</param>
            /// <param name="ocrLanguage">OCR language.</param>
            /// <param name="searchablePdfFilename">A filename of destination searchable PDF file.</param>
            public static void ConvertImageOnlyPdfToSearchablePdf(
                string imageOnlyPdfFilename,
                Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
                string searchablePdfFilename)
            {
                // create an image collection
                using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
                {
                    // add pages from image-only PDF document into image collection
                    images.Add(imageOnlyPdfFilename);
    
                    // create a searchable PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument document =
                        new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                    {
                        Console.WriteLine("Create OCR engine...");
                        // create the Tesseract OCR engine
                        using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(@"..\..\TesseractOCR\"))
                        {
                            Console.WriteLine("Initialize OCR engine...");
                            // init the Tesseract OCR engine
                            tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));
    
                            // create a PDF document builder
                            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                                new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
                            // specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                            // specify that image must be place over text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;
    
                            // for each image in image collection
                            foreach (Vintasoft.Imaging.VintasoftImage image in images)
                            {
                                Console.WriteLine("Recognize text in image...");
                                // recognize text on image
                                Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.Recognize(image);
    
                                // remove low confidence words from OCR result
                                RemoveLowConfidenceWords(ocrPage);
    
                                Console.WriteLine("Add page to a PDF document...");
                                // add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, ocrPage);
                            }
    
                            // shutdown OCR engine
                            tesseractOcr.Shutdown();
    
                            Console.WriteLine("Save changes in PDF document...");
                            // save changes in PDF document
                            document.SaveChanges();
                        }
                    }
    
                    // clear and dispose images in image collection
                    images.ClearAndDisposeItems();
                }
            }
    
            /// <summary>
            /// Preprocess an image before text recognition.
            /// </summary>
            /// <param name="image">Image to preprocess.</param>
            static void PreprocessImageBeforeOcr(Vintasoft.Imaging.VintasoftImage image)
            {
                // remove noise from image
                Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand =
                    new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
                despeckleCommand.ExecuteInPlace(image);
            }
    
            /// <summary>
            /// Removes low confidence words from OCR page.
            /// </summary>
            /// <param name="ocrPage">OCR page.</param>
            static void RemoveLowConfidenceWords(Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage)
            {
                // minimum confidence
                const float MIN_CONFIDENCE = 75.0f;
    
                // get all words in recognized text
                Vintasoft.Imaging.Ocr.Results.OcrObject[] ocrObjects =
                    ocrPage.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word);
                // create list of words to remove
                List<Vintasoft.Imaging.Ocr.Results.OcrObject> removeObjects =
                    new List<Vintasoft.Imaging.Ocr.Results.OcrObject>();
                // for each word
                foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrObjects)
                {
                    // if word confidence is less than minimum confidence
                    if (word.Confidence < MIN_CONFIDENCE)
                        // add word to a list of words to remove
                        removeObjects.Add(word);
                }
    
                // validate recognition results (remove words with low confidence)
    
                Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                    new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(ocrPage);
                editor.RemoveObjects(removeObjects.ToArray());
                editor.ValidateResults();
            }
    
        }
    }
    
    The source code of the console application for VintaSoft Imaging .NET SDK 14.0 can be downloaded here.


    Page 1 from 1: 1