GetPageText method
<< Click to Display Table of Contents >> Navigation: Component Description > Methods > GetPageText method |
Declaration
function GetPageText(PageNo: Integer; format: string = ''): string;
Description
This function retrieves the text of a certain page as an ANSI string.
PageNo is 0 based.
You can specify the format you need:
"ANSI" - Ansitext
"HTML" - HTML with CSS styles
"XYHTM" or "XYHTML" - HTML with CSS styles - each character will be placed directly using absolute CSS positions. Pages are separated by <page/> - since this is not supported by HTML readers, it is best to export the text page by page - otherwise you will see overprinted text.
Use command COMPDF_GetTextSetOptions (=272) to modify the extracted text:
Set bit 2 if you need y position written as text baseline position (=default) or clear bit 2 to save the top position
"XML" - simplified XML code. The text is exported encoded into UTF8 format. <?xml...> tags are suppressed, to make it easier to append the text.
The following tags are used:
<table>, <tr>, <td> are used to separate tables from the text
<page units="pt" n="pagenumber 0..x" w="width" h="height"> </page> encloses one page
<text ff="fontface" fs="fontsize" x="xpos" y="ypos" fc="fontcolor"> </text> encloses text
i="1" will be written for italic text, b="1" will be used for bold text.
The engine will combine consecutive characters into one <text> tag encoded to UTF8.
"RTF" - RTF code
The method is implemented like this:
function TWPViewPDF.GetPageText(PageNo: Integer; format: string = ''):AnsiString;
// Returns the text of page PageNo (0 based) in the requested format.
//   PageNo - index of the page to read
//   format - format name such as 'ANSI', 'HTML', 'XML' or 'RTF';
//            defaults to the empty string
var
  textLen: Integer;
begin
  // Ask for the text length first so the result can be sized to fit.
  textLen := CommandStrEx(COMPDF_GetTextLen, format, PageNo);
  SetLength(Result, textLen);
  // Nothing to fetch for an empty result.
  if textLen <= 0 then
    Exit;
  // Pass the address of the result buffer to COMPDF_GetTextBuf
  // (presumably the engine copies the prepared text into it).
  CommandEx(COMPDF_GetTextBuf, Cardinal(PAnsiChar(Result)));
end;
It is possible to limit the area where the text is extracted by specifying a rectangle, x,y,x1,y1.
Please note that the values are measured at 72 dpi and do not take into account any rotation which may be applied to the PDF page.
// Restrict text extraction to the rectangle (x, y) - (x1, y1).
// The coordinates are given at 72 dpi and ignore any page rotation.
// Option value 4+2 switches the filter on; passing 2 alone switches it off again.
pdf.command(COMPDF_GetTextSetOptions, 4+2); // Activate the filter
pdf.command(COMPDF_GetTextFilterRectX, x ); // Left and Top Values
pdf.command(COMPDF_GetTextFilterRectY, y );
pdf.command(COMPDF_GetTextFilterRectX1, x1 ); // Right and Bottom values
pdf.command(COMPDF_GetTextFilterRecty1, y1 );
Don't forget to call pdf.command(COMPDF_GetTextSetOptions, 2) to deactivate the filter when you are done.
Example:
// event handler for OnSelRect
procedure TForm1.SelRectEventToDrawrectangleandextracttext(Sender: TObject; const PageNr: Integer; R: TRect);
// OnSelRect handler: extracts the text inside the selected rectangle R
// on page PageNr and shows it in a message box.
var
  viewer: TWPViewPDF;
  extracted: string;
begin
  viewer := Sender as TWPViewPDF;
  // Detach this handler so it only runs for a single selection.
  viewer.OnSelRectEvent := nil;
  try
    // Switch the rectangle filter on ...
    viewer.command(COMPDF_GetTextSetOptions, 4+2);
    // ... and pass the selection: left/top first, then right/bottom.
    viewer.command(COMPDF_GetTextFilterRectX, R.Left);
    viewer.command(COMPDF_GetTextFilterRectY, R.Top);
    viewer.command(COMPDF_GetTextFilterRectX1, R.Right);
    viewer.command(COMPDF_GetTextFilterRecty1, R.Bottom);
    extracted := viewer.GetPageText(PageNr);
    // Optional: mark the selected area as well
    // viewer.AddHighlightRect(0, R.Left, R.Top, R.Width, R.Height, 255, []);
    ShowMessage(extracted);
  finally
    // Always switch the filter off again, even if the extraction failed.
    viewer.command(COMPDF_GetTextSetOptions, 2);
  end;
end;
// on button click ....
procedure TForm1.Drawrectangleandextracttext_Click(Sender: TObject);
// Button handler: installs the selection handler and switches the viewer
// into select mode so the user can draw a rectangle.
begin
  // Nothing to do without a viewer instance.
  if not Assigned(pdf) then
    Exit;
  pdf.OnSelRectEvent := SelRectEventToDrawrectangleandextracttext;
  pdf.CommandEx(COMPDF_SelectMode, 2); // let the user draw ...
end;