PDFからテキストコンテンツを読み取るためにiTextSharpを使用しています。テキスト自体は問題なく読み取れます。しかし、フォントや色などのテキストの書式設定が失われてしまいます。その書式設定を取得する方法はありますか?
以下は、私がテキストの抽出に使用しているコードです。
// Open the PDF, fetch the content bytes of page 1, and display the text recovered from them.
PdfReader reader = new PdfReader("F:\\EBooks\\AspectsOfAjax.pdf");
byte[] firstPageContent = reader.GetPageContent(1);
textBox1.Text = ExtractTextFromPDFBytes(firstPageContent);
/// <summary>
/// Scans the bytes of a PDF page content stream and returns the characters found inside
/// <c>( ... )</c> string literals that appear between the <c>BT</c> and <c>ET</c> operators.
/// </summary>
/// <remarks>
/// Only the operators <c>BT</c>, <c>ET</c>, <c>TD</c>, <c>Td</c>, <c>T*</c>, <c>'</c>,
/// <c>"</c> and <c>Tj</c> are recognised; everything else in the stream is ignored, so no
/// font or colour information is produced. The scan is best-effort: if anything throws
/// while scanning, the partial result is discarded and an empty string is returned.
/// </remarks>
/// <param name="input">Content bytes of one page; may be null or empty.</param>
/// <returns>The recovered text, or an empty string for null/empty input or on any failure.</returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    try
    {
        // A StringBuilder keeps the scan linear; appending to a string once per byte
        // re-copies the whole result each time.
        System.Text.StringBuilder text = new System.Text.StringBuilder();

        // Operator sets are loop-invariant, so build them once instead of once per byte.
        string[] positionOperators = { "TD", "Td" };
        string[] nextLineOperators = { "'", "T*", "\"" };
        string[] showTextOperators = { "Tj" };
        string[] endTextOperators = { "ET" };
        string[] beginTextOperators = { "BT" };

        // True while we are between BT and ET.
        bool inTextObject = false;
        // True when the previous character was an unescaped backslash inside a string,
        // e.g. "\\" yields '\' and "\(" yields '('.
        bool nextLiteral = false;
        // Parenthesis nesting level (only 0 and 1 are tracked); text lives inside ( ).
        int bracketDepth = 0;

        // Sliding window over the most recent characters, used to recognise operators.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < previousCharacters.Length; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Outside a string literal: translate positioning/show operators seen
                // just before this character into whitespace in the output.
                if (bracketDepth == 0)
                {
                    if (CheckToken(positionOperators, previousCharacters))
                    {
                        // NOTE: "\n\r" (not "\r\n") is what this method has always emitted;
                        // it is kept as-is so existing callers see unchanged output.
                        text.Append("\n\r");
                    }
                    else if (CheckToken(nextLineOperators, previousCharacters))
                    {
                        text.Append("\n");
                    }
                    else if (CheckToken(showTextOperators, previousCharacters))
                    {
                        text.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTextOperators, previousCharacters))
                {
                    // End of the text object: separate it from whatever follows.
                    inTextObject = false;
                    text.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start of a string literal.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // End of the string literal.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape introducer: emit the following character verbatim.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the 128..254 range; drop everything else.
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            text.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Slide the window left by one and store the current character at the end.
            System.Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // The BT check runs after the window update so the operator is recognised as
            // soon as its trailing delimiter has been read.
            if (!inTextObject && CheckToken(beginTextOperators, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return text.ToString();
    }
    catch
    {
        // Deliberate best-effort contract: callers get "" rather than an exception.
        return "";
    }
}
/// <summary>
/// Tells whether the tail of <paramref name="recent"/> is one of <paramref name="tokens"/>
/// standing alone, i.e. laid out as <c>[delimiter][token][delimiter]</c> with the final
/// delimiter in the last slot of the buffer.
/// </summary>
/// <remarks>
/// A delimiter is a space, carriage return (0x0d) or line feed (0x0a). Tokens of any
/// length are supported; previously the second character of every token was read
/// unconditionally, so the one-character operators <c>'</c> and <c>"</c> could raise
/// <see cref="System.IndexOutOfRangeException"/>.
/// </remarks>
/// <param name="tokens">Candidate operator names; null or empty entries are skipped.</param>
/// <param name="recent">Sliding window of the most recently read characters.</param>
/// <returns>True if any token matches at the end of the window; otherwise false.</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    if (tokens == null || recent == null || recent.Length == 0)
    {
        return false;
    }

    int trailingIndex = recent.Length - 1;
    if (!IsTokenDelimiter(recent[trailingIndex]))
    {
        // Every match requires a delimiter in the last slot, whatever the token is.
        return false;
    }

    foreach (string token in tokens)
    {
        if (string.IsNullOrEmpty(token))
        {
            continue;
        }

        int tokenStart = trailingIndex - token.Length;
        int leadingIndex = tokenStart - 1;
        if (leadingIndex < 0 || !IsTokenDelimiter(recent[leadingIndex]))
        {
            // Window too short for this token, or the token is glued to other text.
            continue;
        }

        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[tokenStart + k] != token[k])
            {
                matches = false;
                break;
            }
        }

        if (matches)
        {
            return true;
        }
    }

    return false;
}

/// <summary>Returns true for the characters treated as operator separators: space, CR, LF.</summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == 0x0d || c == 0x0a;
}
別の方向からアプローチしてみます。iTextSharpには、いくつかの基本的なトークンを処理する、非常に美しくシンプルなテキスト抽出システムがあります。残念ながら色情報は処理しませんが、@Mark Storer によると、自分で実装するのはそれほど難しくないかもしれません。
編集開始
色情報の実装に着手しました。詳細は 私のブログ投稿はこちら を参照してください。 (形式が正しくないため申し訳ありませんが、今夕食に向かいます。)
END EDIT
以下のコードは、このサイトのいくつかの質問と回答を組み合わせたものです。フォントの高さを取得する方法(正確ではありません)に加えて、偽の太字(フェイクボールド)を検出する方法も示しています(後者の元になった質問は、どうしてももう見つけられませんでした)。
PostscriptFontName は、フォント名の前に追加の文字が付いた値を返します。フォントのサブセットを埋め込む場合に付与されるものだと思います。
以下は、iTextSharp 5.1.1.0をターゲットとし、テキストをHTMLとして抽出する完全なWinFormsアプリケーションです。
サンプルPDFのスクリーンショット
HTMLとして抽出されたサンプルテキスト
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Hello </span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">w</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:37.87201">o</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">rl</span>
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">d </span>
<br />
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Test </span>
コード
using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;
namespace WindowsFormsApplication2
{
public partial class Form1 : Form
{
// Constructs the form. InitializeComponent is not defined in this part of the
// partial class; presumably it is the designer-generated control setup — confirm.
public Form1()
{
InitializeComponent();
}
/// <summary>
/// Load handler: extracts page 1 of "Document.pdf" from the current user's desktop
/// through <c>TextWithFontExtractionStategy</c>, writes the resulting HTML to the
/// console and then closes the form.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf");
    PdfReader reader = new PdfReader(pdfPath);
    try
    {
        TextWithFontExtractionStategy strategy = new TextWithFontExtractionStategy();
        string html = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
        Console.WriteLine(html);
    }
    finally
    {
        // Release the reader even when extraction throws; it was never closed before.
        reader.Close();
    }

    this.Close();
}
/// <summary>
/// Text extraction strategy that renders the text chunks of a page as HTML: each run of
/// chunks sharing baseline height, font name and font size becomes one
/// <c>&lt;span&gt;</c>, and a change of baseline height inserts a <c>&lt;br /&gt;</c>.
/// </summary>
/// <remarks>
/// The font size is approximated by the distance between the baseline start point and the
/// ascent line end point. Baseline and size are compared with exact float equality, so any
/// difference at all starts a new span.
/// </remarks>
public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
    // HTML accumulated so far; the most recently opened <span> is left unclosed here.
    private readonly StringBuilder result = new StringBuilder();

    // Properties of the previously rendered chunk; lastBaseLine == null means "no chunk yet".
    private Vector lastBaseLine;
    private string lastFont;
    private float lastFontSize;

    // Numeric text render modes as compared against TextRenderInfo.GetTextRenderMode().
    // http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
    private enum TextRenderMode
    {
        FillText = 0,
        StrokeText = 1,
        FillThenStrokeText = 2,
        Invisible = 3,
        FillTextAndAddToPathForClipping = 4,
        StrokeTextAndAddToPathForClipping = 5,
        FillThenStrokeTextAndAddToPathForClipping = 6,
        AddTextToPathForClipping = 7
    }

    /// <summary>
    /// Appends one text chunk, first opening a new span (and closing the previous one)
    /// when the baseline height, font name or font size differs from the previous chunk.
    /// </summary>
    /// <param name="renderInfo">The chunk being rendered.</param>
    public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
    {
        string curFont = renderInfo.GetFont().PostscriptFontName;

        // Fill-then-stroke is treated as faux bold and reflected in the font name.
        if (renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText)
        {
            curFont += "-Bold";
        }

        // This code assumes that a change of baseline height means a new line.
        Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
        Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
        float curFontSize = rect.Height;

        bool isFirstChunk = this.lastBaseLine == null;
        bool baselineChanged = !isFirstChunk && curBaseline[Vector.I2] != this.lastBaseLine[Vector.I2];
        bool styleChanged = curFontSize != this.lastFontSize || curFont != this.lastFont;

        if (isFirstChunk || baselineChanged || styleChanged)
        {
            // Close the span opened for the previous chunk, if there was one.
            if (!isFirstChunk)
            {
                this.result.AppendLine("</span>");
            }

            if (baselineChanged)
            {
                this.result.AppendLine("<br />");
            }

            // Invariant culture keeps the decimal point a '.', which CSS requires; the
            // current culture may format the size with a decimal comma.
            this.result.AppendFormat(
                System.Globalization.CultureInfo.InvariantCulture,
                "<span style=\"font-family:{0};font-size:{1}\">",
                HtmlEscape(curFont),
                curFontSize);
        }

        // Text comes from the PDF, so escape it before placing it into markup.
        this.result.Append(HtmlEscape(renderInfo.GetText()));

        this.lastBaseLine = curBaseline;
        this.lastFontSize = curFontSize;
        this.lastFont = curFont;
    }

    /// <summary>
    /// Returns the HTML produced so far with the last open span closed.
    /// </summary>
    /// <remarks>
    /// The buffer itself is not modified, so calling this more than once does not stack
    /// extra closing tags.
    /// </remarks>
    /// <returns>The HTML, or an empty string if no text has been rendered.</returns>
    public string GetResultantText()
    {
        if (this.result.Length == 0)
        {
            return string.Empty;
        }

        return this.result.ToString() + "</span>";
    }

    // Not needed
    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderImage(ImageRenderInfo renderInfo) { }

    // Escapes the characters that are significant in HTML text and double-quoted
    // attribute values; null is treated as an empty string.
    private static string HtmlEscape(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        StringBuilder escaped = new StringBuilder(value.Length);
        foreach (char ch in value)
        {
            switch (ch)
            {
                case '&':
                    escaped.Append("&amp;");
                    break;
                case '<':
                    escaped.Append("&lt;");
                    break;
                case '>':
                    escaped.Append("&gt;");
                    break;
                case '"':
                    escaped.Append("&quot;");
                    break;
                default:
                    escaped.Append(ch);
                    break;
            }
        }

        return escaped.ToString();
    }
}
}
}
@Chris のコードをJavaに変換しました。
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;
/**
 * Text extraction strategy that renders the text chunks of a page as HTML: each run of
 * chunks sharing baseline height, font name and font size becomes one {@code <span>},
 * and a change of baseline height inserts a {@code <br />}.
 *
 * <p>The font size is approximated by the distance between the baseline start point and
 * the ascent line end point. Baseline and size are compared with exact float equality,
 * so any difference at all starts a new span.
 */
public class TextWithFontExtractionStategy implements TextExtractionStrategy {
    // HTML accumulated so far; the most recently opened <span> is left unclosed here.
    private final StringBuilder result = new StringBuilder();

    // Properties of the previously rendered chunk; lastBaseLine == null means "no chunk yet".
    private Vector lastBaseLine;
    private String lastFont;
    private float lastFontSize;

    // Numeric text render modes as compared against TextRenderInfo.getTextRenderMode().
    // http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
    private enum TextRenderMode {
        FillText(0),
        StrokeText(1),
        FillThenStrokeText(2),
        Invisible(3),
        FillTextAndAddToPathForClipping(4),
        StrokeTextAndAddToPathForClipping(5),
        FillThenStrokeTextAndAddToPathForClipping(6),
        AddTextToPathForClipping(7);

        private final int value;

        TextRenderMode(int value) {
            this.value = value;
        }

        public int getValue() {
            return value;
        }
    }

    /**
     * Appends one text chunk, first opening a new span (and closing the previous one)
     * when the baseline height, font name or font size differs from the previous chunk.
     *
     * @param renderInfo the chunk being rendered
     */
    public void renderText(TextRenderInfo renderInfo) {
        String curFont = renderInfo.getFont().getPostscriptFontName();

        // Fill-then-stroke is treated as faux bold and reflected in the font name.
        if (renderInfo.getTextRenderMode() == TextRenderMode.FillThenStrokeText.getValue()) {
            curFont += "-Bold";
        }

        // This code assumes that a change of baseline height means a new line.
        Vector curBaseline = renderInfo.getBaseline().getStartPoint();
        Vector topRight = renderInfo.getAscentLine().getEndPoint();
        Rectangle rect = new Rectangle(curBaseline.get(Vector.I1), curBaseline.get(Vector.I2), topRight.get(Vector.I1), topRight.get(Vector.I2));
        float curFontSize = rect.getHeight();

        boolean isFirstChunk = this.lastBaseLine == null;
        boolean baselineChanged = !isFirstChunk && curBaseline.get(Vector.I2) != this.lastBaseLine.get(Vector.I2);
        // Strings must be compared by value; '!=' compares references and would report a
        // font change on practically every chunk.
        boolean fontChanged = (curFont == null) ? (this.lastFont != null) : !curFont.equals(this.lastFont);
        boolean sizeChanged = curFontSize != this.lastFontSize;

        if (isFirstChunk || baselineChanged || sizeChanged || fontChanged) {
            // Close the span opened for the previous chunk, if there was one.
            if (!isFirstChunk) {
                this.result.append("</span>").append("\n");
            }

            if (baselineChanged) {
                this.result.append("<br />").append("\n");
            }

            // Plain appends: the earlier format string kept the braces of the C#
            // placeholders and so emitted "font-family:{Name};font-size:{12.0}".
            this.result.append("<span style=\"font-family:")
                    .append(escapeHtml(curFont))
                    .append(";font-size:")
                    .append(curFontSize)
                    .append("\">");
        }

        // Text comes from the PDF, so escape it before placing it into markup. No
        // separator is added: chunks can split a word, and the C# version this was
        // ported from appends the text as-is.
        this.result.append(escapeHtml(renderInfo.getText()));

        this.lastBaseLine = curBaseline;
        this.lastFontSize = curFontSize;
        this.lastFont = curFont;
    }

    /**
     * Returns the HTML produced so far with the last open span closed. The buffer itself
     * is not modified, so calling this more than once does not stack extra closing tags.
     *
     * @return the HTML, or an empty string if no text has been rendered
     */
    public String getResultantText() {
        if (this.result.length() == 0) {
            return "";
        }
        return this.result.toString() + "</span>";
    }

    // Not needed
    public void beginTextBlock() { }

    public void endTextBlock() { }

    public void renderImage(ImageRenderInfo renderInfo) { }

    // Escapes the characters that are significant in HTML text and double-quoted
    // attribute values; null is treated as an empty string.
    private static String escapeHtml(String value) {
        if (value == null || value.length() == 0) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char ch = value.charAt(i);
            switch (ch) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(ch);
                    break;
            }
        }
        return escaped.toString();
    }
}