iTextSharpでテキストの書式を取得する方法

Question

PDFからテキストコンテンツを読み取るためにiTextSharpを使用しています。テキスト自体は読み取れるのですが、フォントや色などの書式設定が失われてしまいます。その書式設定を取得する方法はありますか？

以下は、テキストの抽出に使用しているコードセグメントです。

// Usage: read the raw content stream of page 1 and show the extracted text.
// The verbatim string (@"...") keeps the backslashes of the Windows path literal.
PdfReader reader = new PdfReader(@"F:\EBooks\AspectsOfAjax.pdf");
try
{
    textBox1.Text = ExtractTextFromPDFBytes(reader.GetPageContent(1));
}
finally
{
    // Release the reader even when extraction fails.
    reader.Close();
}

/// <summary>
/// Extracts the text found inside the text objects (BT ... ET) of a raw page
/// content stream. Only the characters inside "( ... )" string operands are
/// emitted; font, size and color operators are ignored, so no formatting
/// information survives.
/// </summary>
/// <param name="input">Bytes of the page content stream; may be null or empty.</param>
/// <returns>
/// The extracted text, or an empty string when <paramref name="input"/> is null
/// or empty, or when any error occurs while scanning (best-effort contract).
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    try
    {
        // A builder avoids re-copying the whole result for every appended character.
        System.Text.StringBuilder resultString = new System.Text.StringBuilder();

        // Operator groups, allocated once instead of once per input byte.
        string[] moveTokens = new string[] { "TD", "Td" };
        string[] newLineTokens = new string[] { "'", "T*", "\"" };
        string[] showTokens = new string[] { "Tj" };
        string[] endTextTokens = new string[] { "ET" };
        string[] beginTextTokens = new string[] { "BT" };

        // True while we are between a BT and an ET operator.
        bool inTextObject = false;

        // True when the previous character was an unescaped backslash, so the
        // current one is taken literally, e.g. '\\' gives '\' and '\(' gives '('.
        bool nextLiteral = false;

        // () bracket nesting level. Text appears inside ().
        int bracketDepth = 0;

        // Sliding window over the most recent characters, used to spot operators.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Operators are only recognised outside of string operands.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTokens, previousCharacters))
                    {
                        // NOTE(review): LF followed by CR, kept exactly as in the original.
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(newLineTokens, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(showTokens, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                {
                    // End of the text object.
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                {
                    // Start outputting text.
                    bracketDepth = 1;
                }
                else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                {
                    // Stop outputting text.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape introducer: emit nothing, take the next character as-is.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the 128..254 range; drop everything else.
                        if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
                        {
                            resultString.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Shift the window left by one and store the current character at the end.
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object.
            if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return resultString.ToString();
    }
    catch
    {
        // Deliberate best-effort behavior preserved from the original: callers
        // receive an empty string instead of an exception.
        return "";
    }
}

/// <summary>
/// Tells whether the window of recent characters ends with one of the given
/// operators, delimited by whitespace on both sides: the newest character must
/// be a delimiter, the token sits immediately before it, and the character
/// before the token must be a delimiter as well.
/// </summary>
/// <param name="tokens">Operator names to look for; any length is supported.</param>
/// <param name="recent">Window of the last <c>_numberOfCharsToKeep</c> characters, oldest first.</param>
/// <returns>True when any token matches; false otherwise, including when a token does not fit in the window.</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    int last = _numberOfCharsToKeep - 1;
    if (last < 0 || !IsTokenDelimiter(recent[last]))
    {
        return false;
    }

    foreach (string token in tokens)
    {
        // Index of the token's first character inside the window.
        int start = last - token.Length;

        // The token plus its leading delimiter must fit in the window. This also
        // covers one-character operators (' and "), which previously caused an
        // out-of-range read of token[1].
        if (token.Length == 0 || start < 1 || !IsTokenDelimiter(recent[start - 1]))
        {
            continue;
        }

        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[start + k] != token[k])
            {
                matches = false;
                break;
            }
        }

        if (matches)
        {
            return true;
        }
    }

    return false;
}

/// <summary>Returns true for the characters treated as operator delimiters: space, CR and LF.</summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == 0x0d || c == 0x0a;
}

Chris Haas · Accepted Answer

別のアプローチをご紹介します。iTextSharpには、いくつかの基本的なトークンを処理してくれる、とてもシンプルで優れたテキスト抽出システムがあります。残念ながら色情報は処理されませんが、@Mark Storer によると、自分で実装するのはそれほど難しくないかもしれません。

編集開始

色情報の実装に着手しました。詳細は私のブログ投稿を参照してください。（書式が整っておらず申し訳ありません。これから夕食に出かけるところです。）

編集終了

以下のコードは、このサイトのいくつかの質問と回答を組み合わせたものです。フォントの高さを取得する方法（ただし正確ではありません）に関する質問と、偽の太字（faux bold）を検出する方法を示した別の質問（どうしても見つけられなくなってしまいました）が含まれています。

PostscriptFontNameは、フォント名の前に余分な文字が付いた値を返します。これは、フォントのサブセットを埋め込んでいることに関係していると思います。

以下は、iTextSharp 5.1.1.0をターゲットとし、テキストをHTMLとして抽出する完全なWinFormsアプリケーションです。

サンプルPDFのスクリーンショット

HTMLとして抽出されたサンプルテキスト

<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Hello </span> <span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">w</span> <span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:37.87201">o</span> <span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">rl</span> <span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">d </span> <br /> <span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Test </span>

コード

using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;

namespace WindowsFormsApplication2
{
    /// <summary>
    /// Sample form: on load it extracts page 1 of "Document.pdf" from the desktop
    /// as HTML, writes the result to the console and closes itself.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Runs the extraction once and then closes the form.</summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            string pdfPath = System.IO.Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.Desktop),
                "Document.pdf");

            PdfReader reader = new PdfReader(pdfPath);
            try
            {
                TextWithFontExtractionStategy strategy = new TextWithFontExtractionStategy();
                string html = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
                Console.WriteLine(html);
            }
            finally
            {
                // Release the reader even when extraction throws.
                reader.Close();
            }

            this.Close();
        }

        /// <summary>
        /// Extraction strategy that renders text as HTML, wrapping each run of text
        /// that shares baseline, font name and font size in one
        /// <c>&lt;span style="font-family:...;font-size:..."&gt;</c> element and
        /// emitting <c>&lt;br /&gt;</c> when the baseline changes.
        /// </summary>
        public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
        {
            //HTML buffer
            private StringBuilder result = new StringBuilder();

            //Properties of the previously rendered chunk; lastBaseLine is null until the first chunk
            private Vector lastBaseLine;
            private string lastFont;
            private float lastFontSize;

            //Values returned by TextRenderInfo.GetTextRenderMode()
            //http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
            private enum TextRenderMode
            {
                FillText = 0,
                StrokeText = 1,
                FillThenStrokeText = 2,
                Invisible = 3,
                FillTextAndAddToPathForClipping = 4,
                StrokeTextAndAddToPathForClipping = 5,
                FillThenStrokeTextAndAddToPathForClipping = 6,
                AddTextToPathForClipping = 7
            }

            /// <summary>
            /// Appends one text chunk to the HTML buffer, opening a new span (and
            /// closing the previous one) whenever the baseline, font or size changed.
            /// </summary>
            /// <param name="renderInfo">The chunk reported by the iTextSharp parser.</param>
            public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
            {
                string curFont = renderInfo.GetFont().PostscriptFontName;

                //Fill-then-stroke is treated as faux bold
                if (renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText)
                {
                    curFont += "-Bold";
                }

                //The font size is approximated by the distance between baseline and ascent line
                Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
                Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
                    curBaseline[Vector.I1], curBaseline[Vector.I2],
                    topRight[Vector.I1], topRight[Vector.I2]);
                float curFontSize = rect.Height;

                //This code assumes that if the baseline changes then we're on a newline
                bool isFirstChunk = this.lastBaseLine == null;
                bool baselineChanged = !isFirstChunk && curBaseline[Vector.I2] != lastBaseLine[Vector.I2];

                if (isFirstChunk || baselineChanged || (curFontSize != lastFontSize) || (curFont != lastFont))
                {
                    //If we've put down at least one span tag close it
                    if (!isFirstChunk)
                    {
                        this.result.AppendLine("</span>");
                    }

                    //If the baseline has changed then insert a line break
                    if (baselineChanged)
                    {
                        this.result.AppendLine("<br />");
                    }

                    //Invariant culture keeps '.' as the decimal separator so the CSS stays
                    //valid on every locale; the font name is encoded because it lands
                    //inside an HTML attribute.
                    this.result.AppendFormat(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "<span style=\"font-family:{0};font-size:{1}\">",
                        System.Net.WebUtility.HtmlEncode(curFont),
                        curFontSize);
                }

                //Append the current text, encoded so that '<', '>' and '&' from the PDF
                //cannot break the generated markup
                this.result.Append(System.Net.WebUtility.HtmlEncode(renderInfo.GetText()));

                //Remember the properties of this chunk for the next comparison
                this.lastBaseLine = curBaseline;
                this.lastFontSize = curFontSize;
                this.lastFont = curFont;
            }

            /// <summary>
            /// Returns the HTML built so far. The last span is always still open, so
            /// its closing tag is added to the returned copy only; the buffer is not
            /// modified, which makes repeated calls return the same text.
            /// </summary>
            public string GetResultantText()
            {
                if (result.Length == 0)
                {
                    return result.ToString();
                }

                return result.ToString() + "</span>";
            }

            //Not needed
            public void BeginTextBlock() { }
            public void EndTextBlock() { }
            public void RenderImage(ImageRenderInfo renderInfo) { }
        }
    }
}

Motasem M. Al-wazir · Answer

@Chris のコードをJavaに変換しました。

import com.itextpdf.text.Rectangle; import com.itextpdf.text.pdf.parser.ImageRenderInfo; import com.itextpdf.text.pdf.parser.TextExtractionStrategy; import com.itextpdf.text.pdf.parser.TextRenderInfo; import com.itextpdf.text.pdf.parser.Vector; public class TextWithFontExtractionStategy implements TextExtractionStrategy { //HTML buffer private StringBuilder result = new StringBuilder(); //Store last used properties private Vector lastBaseLine; private String lastFont; private float lastFontSize; //http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html private enum TextRenderMode { FillText(0), StrokeText(1), FillThenStrokeText(2), Invisible(3), FillTextAndAddToPathForClipping(4), StrokeTextAndAddToPathForClipping(5), FillThenStrokeTextAndAddToPathForClipping(6), AddTextToPaddForClipping(7); private int value; TextRenderMode(int value) { this.value = value; } public int getValue() { return value; } } public void renderText(TextRenderInfo renderInfo) { String curFont = renderInfo.getFont().getPostscriptFontName(); //Check if faux bold is used if ((renderInfo.getTextRenderMode() == TextRenderMode.FillThenStrokeText.getValue())) { curFont += "-Bold"; } //This code assumes that if the baseline changes then we're on a newline Vector curBaseline = renderInfo.getBaseline().getStartPoint(); Vector topRight = renderInfo.getAscentLine().getEndPoint(); Rectangle rect = new Rectangle(curBaseline.get(Vector.I1), curBaseline.get(Vector.I2), topRight.get(Vector.I1), topRight.get(Vector.I2)); float curFontSize = rect.getHeight(); //See if something has changed, either the baseline, the font or the font size if ((this.lastBaseLine == null) || (curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2)) || (curFontSize != lastFontSize) || (curFont != lastFont)) { //if we've put down at least one span tag close it if ((this.lastBaseLine != null)) { this.result.append("</span>").append("
"); } //If the baseline has changed then insert a line break if ((this.lastBaseLine != null) && curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2)) { this.result.append("<br />").append("
"); } //Create an HTML tag with appropriate styles this.result.append(String.format("<span style=\"font-family:{%s};font-size:{%s}\">", curFont, curFontSize)); } //Append the current text this.result.append(renderInfo.getText() + " "); //Set currently used properties this.lastBaseLine = curBaseline; this.lastFontSize = curFontSize; this.lastFont = curFont; } public String getResultantText() { //If we wrote anything then we'll always have a missing closing tag so close it here if (result.length() > 0) { result.append("</span>"); } return result.toString(); } //Not needed public void beginTextBlock() { } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { } }