Text Extractor

This example demonstrates how to extract text from a PDF document.

Source PDF file:

Extracting text from a PDF document is one of the most common requirements when working with PDF files.

Use the file browser to choose the PDF file from which the text needs to be extracted.

After you click the Generate button, a text file named sample.txt opens, containing the extracted text.

using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Text;

using OfficeComponent.Pdf;
using OfficeComponent.Pdf.Graphics;

namespace OfficeComponent.Samples
{
    /// <summary>
    /// Sample that extracts the text of every page of a PDF document and
    /// writes it to a file named <c>sample.txt</c> in the output directory.
    /// </summary>
    class TextExtractorExample : ExampleBase
#if WEB
, IUIExample
#endif
    {
        /// <summary>
        /// Path of the PDF document whose text is extracted by <see cref="Execute"/>.
        /// </summary>
        public string SourcePdf
        {
            get;
            set;
        }

        /// <summary>
        /// Creates the example, forwarding both paths to the base class.
        /// </summary>
        /// <param name="commonDataPath">Passed through to <c>ExampleBase</c>.</param>
        /// <param name="outputDir">Passed through to <c>ExampleBase</c>.</param>
        public TextExtractorExample(string commonDataPath, string outputDir)
            : base(commonDataPath, outputDir)
        {
        }

        /// <summary>
        /// Creates the example, forwarding the paths and the XML file to the base class.
        /// </summary>
        /// <param name="commonDataPath">Passed through to <c>ExampleBase</c>.</param>
        /// <param name="outputDir">Passed through to <c>ExampleBase</c>.</param>
        /// <param name="xmlFile">Passed through to <c>ExampleBase</c>.</param>
        public TextExtractorExample(string commonDataPath, string outputDir, string xmlFile)
            : base(commonDataPath, outputDir, xmlFile)
        {
        }

        /// <summary>
        /// Extracts the text of all pages of <see cref="SourcePdf"/> and saves it
        /// as <c>sample.txt</c> in the output directory.
        /// </summary>
        /// <returns>
        /// The path of the written text file, or <c>null</c> when
        /// <see cref="SourcePdf"/> is null, empty or whitespace (an error is
        /// reported through <c>ShowError</c> in that case).
        /// </returns>
        public override string Execute()
        {
#if WEB
            ProcessForm();
#endif

            if (string.IsNullOrWhiteSpace(SourcePdf))
            {
                ShowError("Please specify a PDF document.");
                return null;
            }

            // Path.Combine inserts the platform's directory separator and avoids
            // a doubled separator when OutputDir already ends with one.
            string outputFile = System.IO.Path.Combine(OutputDir, "sample.txt");

            // A StringBuilder keeps the accumulation linear in the amount of
            // extracted text; repeated string concatenation re-copies it per page.
            StringBuilder extracted = new StringBuilder();

            // Load the existing PDF; the using block disposes it even if
            // text extraction throws.
            using (PdfImportedDocument ldoc = new PdfImportedDocument(SourcePdf))
            {
                foreach (PdfImportedPage lpage in ldoc.Pages)
                {
                    extracted.Append(lpage.ExtractText());
                }
            }

            // Encode as UTF-16 (little endian). GetBytes does not emit a byte
            // order mark, so the file content is exactly the encoded text.
            byte[] bytes = new UnicodeEncoding().GetBytes(extracted.ToString());

            // WriteAllBytes creates (or overwrites) the file and always closes
            // the underlying handle, including when the write fails.
            System.IO.File.WriteAllBytes(outputFile, bytes);

            return outputFile;
        }

        /// <summary>
        /// Caption of the action this example performs.
        /// </summary>
        public override string ActionTitle
        {
            get { return "Extract Text"; }
        }

#if WEB
        /// <summary>
        /// Sets <see cref="SourcePdf"/> from the value returned by
        /// <c>GetPostFile("SourceFile")</c>.
        /// NOTE(review): presumably the path of the uploaded file - confirm against ExampleBase.
        /// </summary>
        void ProcessForm()
        {
            SourcePdf = GetPostFile("SourceFile");
        }
#endif
    }

}
Imports System.Drawing.Imaging
Imports System.IO
Imports System.Text

Imports OfficeComponent.Pdf
Imports OfficeComponent.Pdf.Graphics

Namespace OfficeComponent.Samples
	''' <summary>
	''' Sample that extracts the text of every page of a PDF document and
	''' writes it to a file named sample.txt in the output directory.
	''' </summary>
#If WEB Then
	Friend Class TextExtractorExample
		Inherits ExampleBase
		Implements IUIExample
#Else
	Friend Class TextExtractorExample
		Inherits ExampleBase
#End If
		Private privateSourcePdf As String

		''' <summary>
		''' Path of the PDF document whose text is extracted by Execute.
		''' </summary>
		Public Property SourcePdf() As String
			Get
				Return privateSourcePdf
			End Get
			Set(ByVal value As String)
				privateSourcePdf = value
			End Set
		End Property

		''' <summary>
		''' Creates the example, forwarding both paths to the base class.
		''' </summary>
		''' <param name="commonDataPath">Passed through to ExampleBase.</param>
		''' <param name="outputDir">Passed through to ExampleBase.</param>
		Public Sub New(ByVal commonDataPath As String, ByVal outputDir As String)
			MyBase.New(commonDataPath, outputDir)
		End Sub

		''' <summary>
		''' Creates the example, forwarding the paths and the XML file to the base class.
		''' </summary>
		''' <param name="commonDataPath">Passed through to ExampleBase.</param>
		''' <param name="outputDir">Passed through to ExampleBase.</param>
		''' <param name="xmlFile">Passed through to ExampleBase.</param>
		Public Sub New(ByVal commonDataPath As String, ByVal outputDir As String, ByVal xmlFile As String)
			MyBase.New(commonDataPath, outputDir, xmlFile)
		End Sub

		''' <summary>
		''' Extracts the text of all pages of SourcePdf and saves it as
		''' sample.txt in the output directory.
		''' </summary>
		''' <returns>
		''' The path of the written text file, or Nothing when SourcePdf is
		''' Nothing, empty or whitespace (an error is reported through
		''' ShowError in that case).
		''' </returns>
		Public Overrides Function Execute() As String
#If WEB Then
			ProcessForm()
#End If

			If String.IsNullOrWhiteSpace(SourcePdf) Then
				ShowError("Please specify a PDF document.")
				Return Nothing
			End If

			' Path.Combine inserts the platform's directory separator and avoids
			' a doubled separator when OutputDir already ends with one.
			Dim outputPath As String = System.IO.Path.Combine(OutputDir, "sample.txt")

			' A StringBuilder keeps the accumulation linear in the amount of
			' extracted text; repeated string concatenation re-copies it per page.
			Dim extracted As New StringBuilder()

			' Load the existing PDF; the Using block disposes it even if
			' text extraction throws.
			Using ldoc As New PdfImportedDocument(SourcePdf)
				For Each lpage As PdfImportedPage In ldoc.Pages
					extracted.Append(lpage.ExtractText())
				Next lpage
			End Using

			' Encode as UTF-16 (little endian). GetBytes does not emit a byte
			' order mark, so the file content is exactly the encoded text.
			Dim bytes() As Byte = (New UnicodeEncoding()).GetBytes(extracted.ToString())

			' WriteAllBytes creates (or overwrites) the file and always closes
			' the underlying handle, including when the write fails.
			System.IO.File.WriteAllBytes(outputPath, bytes)

			Return outputPath
		End Function

		''' <summary>
		''' Caption of the action this example performs.
		''' </summary>
		Public Overrides ReadOnly Property ActionTitle() As String
			Get
				Return "Extract Text"
			End Get
		End Property

#If WEB Then
		''' <summary>
		''' Sets SourcePdf from the value returned by GetPostFile("SourceFile").
		''' NOTE(review): presumably the path of the uploaded file - confirm against ExampleBase.
		''' </summary>
		Private Sub ProcessForm()
			SourcePdf = GetPostFile("SourceFile")
		End Sub
#End If
	End Class

End Namespace