将来の使用のためにWord文書からテキストを取得する必要があるシナリオがあり、cv's/resumesでの検索のように、ユーザーがアップロードした文書内の文字列を検索し、ユーザーがWordドキュメントをアップロードした場合、いくつかの役立つリンクがありますが、問題全体を解決することはできません。アップロード時にテキストを取得し、データベースにテキストを保存する必要があり、データベース内で簡単に検索できます。
.doc/.docxに適切な仕事をする簡単なクラスは、 PHP docx reader:MS Word Docxファイルをtext 。
class DocxConversion{
private $filename;
public function __construct($filePath) {
$this->filename = $filePath;
}
private function read_doc() {
$fileHandle = fopen($this->filename, "r");
$line = @fread($fileHandle, filesize($this->filename));
$lines = explode(chr(0x0D),$line);
$outtext = "";
foreach($lines as $thisline)
{
$pos = strpos($thisline, chr(0x00));
if (($pos !== FALSE)||(strlen($thisline)==0))
{
} else {
$outtext .= $thisline." ";
}
}
$outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$outtext);
return $outtext;
}
private function read_docx(){
$striped_content = '';
$content = '';
$Zip = Zip_open($this->filename);
if (!$Zip || is_numeric($Zip)) return false;
while ($Zip_entry = Zip_read($Zip)) {
if (Zip_entry_open($Zip, $Zip_entry) == FALSE) continue;
if (Zip_entry_name($Zip_entry) != "Word/document.xml") continue;
$content .= Zip_entry_read($Zip_entry, Zip_entry_filesize($Zip_entry));
Zip_entry_close($Zip_entry);
}// end while
Zip_close($Zip);
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
$striped_content = strip_tags($content);
return $striped_content;
}
/************************Excel sheet************************************/
function xlsx_to_text($input_file){
$xml_filename = "xl/sharedStrings.xml"; //content file name
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open($input_file)){
if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
$xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text = strip_tags($xml_handle->saveXML());
}else{
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .="";
}
return $output_text;
}
/*************************power point files*****************************/
function pptx_to_text($input_file){
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open($input_file)){
$slide_number = 1; //loop through slide files
while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
$xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text .= strip_tags($xml_handle->saveXML());
$slide_number++;
}
if($slide_number == 1){
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .="";
}
return $output_text;
}
public function convertToText() {
if(isset($this->filename) && !file_exists($this->filename)) {
return "File Not exists";
}
$fileArray = pathinfo($this->filename);
$file_ext = $fileArray['extension'];
if($file_ext == "doc" || $file_ext == "docx" || $file_ext == "xlsx" || $file_ext == "pptx")
{
if($file_ext == "doc") {
return $this->read_doc();
} elseif($file_ext == "docx") {
return $this->read_docx();
} elseif($file_ext == "xlsx") {
return $this->xlsx_to_text();
}elseif($file_ext == "pptx") {
return $this->pptx_to_text();
}
} else {
return "Invalid File Type";
}
}
}
Document_file_format Docファイルはバイナリblobです。これらは fopen を使用して読み取ることができますが、.docxファイルはZipファイルとxmlファイルです zipファイルコンテナー内のxmlファイル(ソースウィキペディア)Zip_open を使用してそれらを読み取ることができます。
上記のクラスの使用法
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText= $docObj->convertToText();
From DOC file
$filename = 'ypue file';
if ( file_exists($filename) ) {
if ( ($fh = fopen($filename, 'r')) !== false ) {
$headers = fread($fh, 0xA00);
$n1 = ( ord($headers[0x21C]) - 1 );
$n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );
$n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );
$n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );
$textLength = ($n1 + $n2 + $n3 + $n4);
$extracted_plaintext = fread($fh, $textLength);
echo nl2br($extracted_plaintext);
print_r(extract_emails_from($extracted_plaintext));
}
}
function extract_emails_from($string) {
preg_match_all("/[\._a-zA-Z0-9-]+@[\._a-zA-Z0-9-]+/i", $string, $matches);
return $matches[0];
}
DOCXから:
/*Name of the document file*/
$document = 'your file';
/**Function to extract text*/
function extracttext($filename) {
//Check for extension
$ext = end(explode('.', $filename));
//if its docx file
if($ext == 'docx')
$dataFile = "Word/document.xml";
//else it must be odt file
else
$dataFile = "content.xml";
//Create a new Zip archive object
$Zip = new ZipArchive;
// Open the archive file
if (true === $Zip->open($filename)) {
// If successful, search for the data file in the archive
if (($index = $Zip->locateName($dataFile)) !== false) {
// Index found! Now read it to a string
$text = $Zip->getFromIndex($index);
// Load XML from a string
// Ignore errors and warnings
$xml = DOMDocument::loadXML($text, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
// Remove XML formatting tags and return the text
return strip_tags($xml->saveXML());
}
//Close the archive file
$Zip->close();
}
// In case of failure return a message
return "File not found";
}
echo extracttext($document);
// DOCXの場合、空白を保持したい場合は、テーブルtrとtcにも注意し、以下のコードを使用します:好みに合わせて変更します。 Cosそれは、リモートまたはローカルからファイルをダウンロードします
//=========DOCX===========
function extractDocxText($url,$file_name){
$docx = get_url($url);
file_put_contents("tempf.docx",$docx);
$xml_filename = "Word/document.xml"; //content file name
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open("tempf.docx")){
if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
//file_put_contents($input_file.".xml",$xml_datas);
$replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"\n\r",$xml_datas);
$replace_tableRows = preg_replace('/<w:tr>/',"\n\r",$replace_newlines);
$replace_tab = preg_replace('/<w:tab\/>/',"\t",$replace_tableRows);
$replace_paragraphs = preg_replace('/<\/w:p>/',"\n\r",$replace_tab);
$replace_other_Tags = strip_tags($replace_paragraphs);
$output_text = $replace_other_Tags;
}else{
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .=" ";
}
chmod("tempf.docx", 0777); unlink(realpath("tempf.docx"));
//save to file or echo content
file_put_contents($file_name,$output_text);
echo $output_text;
}
//========PDF===========
//Requires installation in your Linux server
//Sudo su
//apt-get install xpdf
function extractPdfText($url,$PDF_fullpath_or_Filename){
$pdf = get_url($url);
file_put_contents ("temppdf.txt", $pdf);
$content = pdf2text("temppdf.txt");
chmod("temppdf.txt", 0777); unlink(realpath("temppdf.txt"));
echo $content;
file_put_contents($PDF_fullpath_or_Filename,$content);
}
//========DOC==========
function extractDocText($url,$file_name){
$doc = get_url($url);
file_put_contents ("tempf.txt", $doc);
$fileHandle = fopen("tempf.txt", "r");
$line = @fread($fileHandle, filesize("tempf.txt"));
$lines = explode(chr(0x0D),$line);
$outtext = "";
foreach($lines as $thisline){
$pos = strpos($thisline, chr(0x00));
if (($pos !== FALSE)||(strlen($thisline)==0))
{} else {$outtext .= $thisline."\n\r";}
}
$content = preg_replace('/[a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/',' ',$outtext);
//chmod("tempf.txt", 0777); unlink(realpath("tempf.txt"));
echo $content;
file_put_contents($file_name,$content);
}
//========XLSX==========
function extractXlsxText($url,$file_name){
$xlsx = get_url($url);
file_put_contents ("tempf.txt", $xlsx);
$content = "";
$dir = 'tempforxlsx';
// Unzip
$Zip = new ZipArchive();
$Zip->open("tempf.txt");
$Zip->extractTo($dir);
// Open up shared strings & the first worksheet
$strings = simplexml_load_file($dir . '/xl/sharedStrings.xml');
$sheet = simplexml_load_file($dir . '/xl/worksheets/sheet1.xml');
// Parse the rows
$xlrows = $sheet->sheetData->row;
foreach ($xlrows as $xlrow) {
$arr = array();
// In each row, grab it's value
foreach ($xlrow->c as $cell) {
$v = (string) $cell->v;
// If it has a "t" (type?) of "s" (string?), use the value to look up string value
if (isset($cell['t']) && $cell['t'] == 's') {
$s = array();
$si = $strings->si[(int) $v];
// Register & alias the default namespace or you'll get empty results in the xpath query
$si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
// Cat together all of the 't' (text?) node values
foreach($si->xpath('.//n:t') as $t) {
$content .= $t." ";} }
}
}
echo $content;
file_put_contents($file_name,$content);
}
//========PPT==========
function extractPptText($url,$file_name){
$ppt = file_get_contents($url);
file_put_contents ("tempf.ppt", $ppt);
$fileHandle = fopen("tempf.ppt", "r");
$line = @fread($fileHandle, filesize("tempf.ppt"));
$lines = explode(chr(0x0f),$line);
$outtext = '';
foreach($lines as $thisline) {
if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
$text_line = substr($thisline, 4);
$end_pos = strpos($text_line, chr(0x00));
$text_line = substr($text_line, 0, $end_pos);
$text_line = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/'," ",$text_line);
$outtext = substr($text_line, 0, $end_pos)."\n".$outtext;
}
}
//echo $outtext;
file_put_contents($file_name,$outtext);
}
//========PPTX==========
function extractPptxText($url,$file_name){
$xls = get_url($url);
file_put_contents ("tempf.txt", $xls);
$Zip_handle = new ZipArchive;
$output_text = ' ';
if(true === $Zip_handle->open("tempf.txt")){
$slide_number = 1; //loop through slide files
while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index); // these four lines of codes
// below were
$xml_handle = new DOMDocument (); // added by me in order
$xml_handle->preserveWhiteSpace = true; // to preserve space between
$xml_handle->formatOutput = true; // each read data
$xml_handle->loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text .= $xml_handle->saveXML();
$slide_number++;
}
if($slide_number == 1){
$output_text .= "";
}
$Zip_handle->close();
}else{
$output_text .= "";
}
echo $output_text;
file_put_contents($file_name,$output_text);
}
/*
==========================================================================
=========================================================================
And below is get_url() function: Better than fie_get_contents();
*/
function get_url( $url,$timeout = 5 )
{
$url = str_replace( "&", "&", urldecode(trim($url)) );
$ch = curl_init();
curl_setopt( $ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
curl_setopt( $ch, CURLOPT_URL, $url );
curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
curl_setopt( $ch, CURLOPT_ENCODING, "" );
curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false ); # required for https urls
curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
$content = curl_exec( $ch );
//$response = curl_getinfo( $ch );
curl_close ( $ch );
return $content;
}
Docxドキュメントの場合、docx2txt
ツールの使用をお勧めします(少なくともDebian/Ubuntuで利用可能):
docx2txt < your_file.docx
README
は、vimと統合する方法を説明します。 .vimrc
に追加します:
" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt
(emacsと統合する方法も説明します)。
ハッカーのために、このツールはPerlで書かれています。