This example demonstrates how to extract text from a PDF document.
Extracting text is one of the most common requirements when working with PDF documents.
Use the file browser to choose the PDF file from which the text needs to be extracted.
After you click the Generate button, a text file named sample.txt opens containing the extracted text.
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Text;
using OfficeComponent.Pdf;
using OfficeComponent.Pdf.Graphics;
namespace OfficeComponent.Samples
{
    /// <summary>
    /// Sample that extracts the text of every page of a PDF document and
    /// writes it to a file named sample.txt in the output directory.
    /// </summary>
    class TextExtractorExample : ExampleBase
#if WEB
        , IUIExample
#endif
    {
        /// <summary>
        /// The source PDF document; this value is handed to the
        /// <c>PdfImportedDocument</c> constructor by <see cref="Execute"/>.
        /// </summary>
        public string SourcePdf
        {
            get;
            set;
        }

        /// <summary>
        /// Creates the example; both arguments are forwarded unchanged to the base class.
        /// </summary>
        /// <param name="commonDataPath">Forwarded to the <c>ExampleBase</c> constructor.</param>
        /// <param name="outputDir">Forwarded to the <c>ExampleBase</c> constructor.</param>
        public TextExtractorExample(string commonDataPath, string outputDir)
            : base(commonDataPath, outputDir)
        {
        }

        /// <summary>
        /// Creates the example; all arguments are forwarded unchanged to the base class.
        /// </summary>
        /// <param name="commonDataPath">Forwarded to the <c>ExampleBase</c> constructor.</param>
        /// <param name="outputDir">Forwarded to the <c>ExampleBase</c> constructor.</param>
        /// <param name="xmlFile">Forwarded to the <c>ExampleBase</c> constructor.</param>
        public TextExtractorExample(string commonDataPath, string outputDir, string xmlFile)
            : base(commonDataPath, outputDir, xmlFile)
        {
        }

        /// <summary>
        /// Extracts the text of all pages of <see cref="SourcePdf"/> and writes it,
        /// UTF-16 encoded, to sample.txt inside <c>OutputDir</c>.
        /// </summary>
        /// <returns>
        /// The path of the written text file, or <c>null</c> (after calling
        /// <c>ShowError</c>) when <see cref="SourcePdf"/> is null, empty or whitespace.
        /// </returns>
        public override string Execute()
        {
#if WEB
            ProcessForm();
#endif
            if (string.IsNullOrWhiteSpace(SourcePdf))
            {
                ShowError("Please specify a PDF document.");
                return null;
            }

            // Path.Combine inserts the platform's separator and copes with a
            // trailing separator on OutputDir.
            string outputFile = System.IO.Path.Combine(OutputDir, "sample.txt");

            // A StringBuilder keeps the accumulation linear in the amount of
            // text, instead of re-copying the whole string for every page.
            StringBuilder extracted = new StringBuilder();

            // Load the existing PDF; the using block releases it even if extraction throws.
            using (PdfImportedDocument ldoc = new PdfImportedDocument(SourcePdf))
            {
                foreach (PdfImportedPage lpage in ldoc.Pages)
                {
                    extracted.Append(lpage.ExtractText());
                }
            }

            // Same bytes as before: UTF-16 via UnicodeEncoding.GetBytes, which
            // does not emit a byte-order mark.
            byte[] bytes = new UnicodeEncoding().GetBytes(extracted.ToString());

            // WriteAllBytes creates/overwrites the file and always closes the
            // handle, so nothing leaks if the write fails.
            System.IO.File.WriteAllBytes(outputFile, bytes);

            return outputFile;
        }

        /// <summary>
        /// Gets the action title of this example, "Extract Text".
        /// </summary>
        public override string ActionTitle
        {
            get { return "Extract Text"; }
        }

#if WEB
        /// <summary>
        /// Sets <see cref="SourcePdf"/> to the result of <c>GetPostFile("SourceFile")</c>.
        /// </summary>
        void ProcessForm()
        {
            SourcePdf = GetPostFile("SourceFile");
        }
#endif
    }
}
Imports System.Drawing.Imaging
Imports System.IO
Imports System.Text
Imports OfficeComponent.Pdf
Imports OfficeComponent.Pdf.Graphics
Namespace OfficeComponent.Samples
    ' Sample that extracts the text of every page of a PDF document and
    ' writes it to a file named sample.txt in the output directory.
#If WEB Then
    Friend Class TextExtractorExample
        Inherits ExampleBase
        Implements IUIExample
#Else
    Friend Class TextExtractorExample
        Inherits ExampleBase
#End If
        ' Backing field for the SourcePdf property.
        Private privateSourcePdf As String

        ''' <summary>
        ''' The source PDF document; this value is handed to the
        ''' PdfImportedDocument constructor by Execute.
        ''' </summary>
        Public Property SourcePdf() As String
            Get
                Return privateSourcePdf
            End Get
            Set(ByVal value As String)
                privateSourcePdf = value
            End Set
        End Property

        ''' <summary>
        ''' Creates the example; both arguments are forwarded unchanged to the base class.
        ''' </summary>
        Public Sub New(ByVal commonDataPath As String, ByVal outputDir As String)
            MyBase.New(commonDataPath, outputDir)
        End Sub

        ''' <summary>
        ''' Creates the example; all arguments are forwarded unchanged to the base class.
        ''' </summary>
        Public Sub New(ByVal commonDataPath As String, ByVal outputDir As String, ByVal xmlFile As String)
            MyBase.New(commonDataPath, outputDir, xmlFile)
        End Sub

        ''' <summary>
        ''' Extracts the text of all pages of SourcePdf and writes it,
        ''' UTF-16 encoded, to sample.txt inside OutputDir.
        ''' </summary>
        ''' <returns>
        ''' The path of the written text file, or Nothing (after calling
        ''' ShowError) when SourcePdf is Nothing, empty or whitespace.
        ''' </returns>
        Public Overrides Function Execute() As String
#If WEB Then
            ProcessForm()
#End If
            If String.IsNullOrWhiteSpace(SourcePdf) Then
                ShowError("Please specify a PDF document.")
                Return Nothing
            End If

            ' Path.Combine inserts the platform's separator and copes with a
            ' trailing separator on OutputDir.
            Dim outputFile As String = System.IO.Path.Combine(OutputDir, "sample.txt")

            ' A StringBuilder keeps the accumulation linear in the amount of
            ' text, instead of re-copying the whole string for every page.
            Dim extracted As New StringBuilder()

            ' Load the existing PDF; the Using block releases it even if extraction throws.
            Using ldoc As New PdfImportedDocument(SourcePdf)
                For Each lpage As PdfImportedPage In ldoc.Pages
                    extracted.Append(lpage.ExtractText())
                Next lpage
            End Using

            ' Same bytes as before: UTF-16 via UnicodeEncoding.GetBytes, which
            ' does not emit a byte-order mark.
            Dim bytes() As Byte = (New UnicodeEncoding()).GetBytes(extracted.ToString())

            ' WriteAllBytes creates/overwrites the file and always closes the
            ' handle, so nothing leaks if the write fails.
            System.IO.File.WriteAllBytes(outputFile, bytes)

            Return outputFile
        End Function

        ''' <summary>
        ''' Gets the action title of this example, "Extract Text".
        ''' </summary>
        Public Overrides ReadOnly Property ActionTitle() As String
            Get
                Return "Extract Text"
            End Get
        End Property

#If WEB Then
        ' Sets SourcePdf to the result of GetPostFile("SourceFile").
        Private Sub ProcessForm()
            SourcePdf = GetPostFile("SourceFile")
        End Sub
#End If
    End Class
End Namespace