PHPでWordファイル（.doc、.docx）および.xlsx、.pptxからテキストを抽出する方法

Question

後で利用できるように、Word文書からテキストを取得する必要があります。たとえば履歴書（CV／レジュメ）の検索のように、ユーザーがアップロードした文書の中から文字列を検索したいのです。ユーザーがWordドキュメントをアップロードした場合について、参考になるリンクはいくつか見つかりましたが、どれも問題全体を解決するものではありませんでした。アップロード時にテキストを抽出してデータベースに保存しておけば、データベース内で簡単に検索できるようになります。

M Khalid Junaid · Accepted Answer

以下は、.doc／.docxをうまく処理できるシンプルなクラスです（出典：「PHP docx reader：MS Word Docxファイルをテキストに変換する」）。

 /**
  * Extracts plain text from .doc, .docx, .xlsx and .pptx files.
  *
  * Construct it with the path of a document and call convertToText().
  */
 class DocxConversion
 {
     /** @var string Path of the document to convert. */
     private $filename;

     /**
      * @param string $filePath Path of the document to convert.
      */
     public function __construct($filePath)
     {
         $this->filename = $filePath;
     }

     /**
      * Reads a legacy binary .doc file.
      *
      * The file content is split on carriage returns; chunks that are empty
      * or contain a NUL byte are dropped, the remaining chunks are joined
      * with a trailing space each, and every character outside the
      * whitelist in the pattern below is removed.
      *
      * @return string Extracted text ("" when the file cannot be read).
      */
     private function read_doc()
     {
         // file_get_contents() avoids the leaked fopen() handle and the
         // zero-length fread() failure on empty files.
         $raw = file_get_contents($this->filename);
         if ($raw === false) {
             return "";
         }

         $outtext = "";
         foreach (explode(chr(0x0D), $raw) as $thisline) {
             if ($thisline === "" || strpos($thisline, chr(0x00)) !== false) {
                 continue;
             }
             $outtext .= $thisline . " ";
         }

         // "/" is also the pattern delimiter, so it must be escaped inside
         // the character class or preg_replace() fails.
         return preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_]/", "", $outtext);
     }

     /**
      * Reads the main document part of a .docx archive.
      *
      * Cell boundaries become a space and paragraph ends become "\r\n"
      * before the remaining markup is stripped.
      *
      * @return string|false Extracted text ("" when the archive has no
      *                      word/document.xml), or false when the archive
      *                      cannot be opened.
      */
     private function read_docx()
     {
         $zip = new ZipArchive;
         if ($zip->open($this->filename) !== true) {
             return false;
         }

         // Entry names are case-sensitive: the part is "word/document.xml".
         $content = $zip->getFromName("word/document.xml");
         $zip->close();
         if ($content === false) {
             return '';
         }

         $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
         $content = str_replace('</w:r></w:p>', "\r\n", $content);

         return strip_tags($content);
     }

     /**
      * Parses an XML string and returns it with all markup stripped.
      *
      * @param string|false $xml XML source (false/"" yields "").
      * @return string Text content, or "" when the XML cannot be parsed.
      */
     private function xml_to_text($xml)
     {
         if ($xml === false || $xml === "") {
             return "";
         }

         // Instantiate DOMDocument (the static call is an Error on PHP 8).
         // LIBXML_NOENT is deliberately not used: the documents are
         // user-supplied, and entity substitution would allow XXE.
         $dom = new DOMDocument();
         if (!$dom->loadXML($xml, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
             return "";
         }

         return strip_tags($dom->saveXML());
     }

     /************************Excel sheet************************************/

     /**
      * Returns the shared-strings text of an .xlsx workbook.
      *
      * @param string|null $input_file Workbook path; defaults to the path
      *                                given to the constructor.
      * @return string Extracted text, or "" on any failure.
      */
     function xlsx_to_text($input_file = null)
     {
         if ($input_file === null) {
             $input_file = $this->filename;
         }

         $output_text = "";
         $zip = new ZipArchive;
         if ($zip->open($input_file) === true) {
             $xml_index = $zip->locateName("xl/sharedStrings.xml");
             if ($xml_index !== false) {
                 $output_text = $this->xml_to_text($zip->getFromIndex($xml_index));
             }
             $zip->close();
         }

         return $output_text;
     }

     /*************************power point files*****************************/

     /**
      * Returns the concatenated text of the slides of a .pptx presentation.
      *
      * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... until the
      * first missing number.
      *
      * @param string|null $input_file Presentation path; defaults to the
      *                                path given to the constructor.
      * @return string Extracted text, or "" on any failure.
      */
     function pptx_to_text($input_file = null)
     {
         if ($input_file === null) {
             $input_file = $this->filename;
         }

         $output_text = "";
         $zip = new ZipArchive;
         if ($zip->open($input_file) === true) {
             $slide_number = 1;
             while (($xml_index = $zip->locateName("ppt/slides/slide" . $slide_number . ".xml")) !== false) {
                 $output_text .= $this->xml_to_text($zip->getFromIndex($xml_index));
                 $slide_number++;
             }
             $zip->close();
         }

         return $output_text;
     }

     /**
      * Converts the document to text according to its file extension.
      *
      * @return string|false Extracted text, "File Not exists" when the path
      *                      is missing, "Invalid File Type" for unsupported
      *                      extensions, or false when a .docx archive cannot
      *                      be opened.
      */
     public function convertToText()
     {
         if (!isset($this->filename) || !file_exists($this->filename)) {
             return "File Not exists";
         }

         // PATHINFO_EXTENSION yields "" (not an undefined index) when the
         // name has no extension; lower-casing accepts "REPORT.DOCX" too.
         $file_ext = strtolower(pathinfo($this->filename, PATHINFO_EXTENSION));

         switch ($file_ext) {
             case "doc":
                 return $this->read_doc();
             case "docx":
                 return $this->read_docx();
             case "xlsx":
                 return $this->xlsx_to_text($this->filename);
             case "pptx":
                 return $this->pptx_to_text($this->filename);
             default:
                 return "Invalid File Type";
         }
     }
 }

ファイル形式について（Document_file_format）：.docファイルはバイナリblobであり、fopen を使用して読み取ることができます。一方、.docxファイルの実体は、Zipファイルのコンテナーに格納されたXMLファイル群です（出典：ウィキペディア）。こちらは zip_open を使用して読み取ることができます。

上記のクラスの使用法

// Pick the document to convert; swap in one of the commented lines below
// to try the other supported formats.
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");

// Keep the extracted text in $docText and print it.
echo $docText = $docObj->convertToText();

Jumper Pot · Answer

DOCファイルから：

// Path of the .doc file to read (placeholder value; replace with a real path).
$filename = 'ypue file';

if (file_exists($filename) && ($fh = fopen($filename, 'rb')) !== false) {
    // The text length is decoded from four bytes at offsets 0x21C-0x21F of
    // the first 0xA00 bytes; the text itself is read right after them.
    $headers = fread($fh, 0xA00);

    // A short read means the file is smaller than the header block, so the
    // offsets below would not exist and there is no text to read.
    if ($headers !== false && strlen($headers) === 0xA00) {
        $n1 = ord($headers[0x21C]) - 1;
        $n2 = (ord($headers[0x21D]) - 8) * 256;
        $n3 = ord($headers[0x21E]) * 256 * 256;
        $n4 = ord($headers[0x21F]) * 256 * 256 * 256;
        $textLength = $n1 + $n2 + $n3 + $n4;

        // fread() rejects a non-positive length, which the arithmetic above
        // can produce for files that do not follow the expected layout.
        if ($textLength > 0) {
            $extracted_plaintext = fread($fh, $textLength);
            if (is_string($extracted_plaintext)) {
                // NOTE(review): the text is echoed without HTML escaping;
                // escape it before showing content from untrusted uploads.
                echo nl2br($extracted_plaintext);
                print_r(extract_emails_from($extracted_plaintext));
            }
        }
    }

    fclose($fh);
}

/**
 * Returns every e-mail-like token found in a string.
 *
 * A token is one or more of [._a-zA-Z0-9-], an "@", then one or more of the
 * same characters.
 *
 * @param string $string Text to scan.
 * @return array List of matched tokens (empty when nothing matches).
 */
function extract_emails_from($string)
{
    preg_match_all("/[\._a-zA-Z0-9-]+@[\._a-zA-Z0-9-]+/i", $string, $matches);

    return $matches[0];
}

DOCXから：

 /* Name of the document file */
 $document = 'your file';

 /**
  * Extracts the text of a .docx or .odt document.
  *
  * A ".docx" extension (any letter case) selects word/document.xml; every
  * other name is treated as an .odt file and content.xml is read.
  *
  * @param string $filename Path of the document.
  * @return string The text with XML markup stripped, or "File not found"
  *                when the archive, the data file or its XML cannot be read.
  */
 function extracttext($filename)
 {
     $failure = "File not found";

     // pathinfo() replaces end(explode(...)), which passes a non-variable
     // by reference and raises a notice.
     $ext = strtolower(pathinfo($filename, PATHINFO_EXTENSION));

     // Entry names inside the archive are case-sensitive.
     $dataFile = ($ext === 'docx') ? "word/document.xml" : "content.xml";

     $zip = new ZipArchive;
     if ($zip->open($filename) !== true) {
         return $failure;
     }

     // Read the data file, then close the archive on every path.
     $text = $zip->getFromName($dataFile);
     $zip->close();
     if ($text === false || $text === '') {
         return $failure;
     }

     // Instantiate DOMDocument (the static call is an Error on PHP 8) and
     // do not substitute entities: the document may come from an untrusted
     // upload.
     $xml = new DOMDocument();
     if (!$xml->loadXML($text, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
         return $failure;
     }

     // Remove XML formatting tags and return the text.
     return strip_tags($xml->saveXML());
 }

 echo extracttext($document);

FRanklinDavid · Answer

DOCXで空白を保持し、テーブルの行（tr）やセル（tc）も考慮したい場合は、以下のコードを使用してください。必要に応じて、好みに合わせて変更してください。このコードは、リモートまたはローカルからファイルをダウンロードして処理します。

/**
 * Downloads a document with get_url() into a fresh temporary file.
 *
 * A unique temporary file replaces the fixed names in the working
 * directory, which collide when two requests run at the same time.
 *
 * @param string $url    Location of the document.
 * @param string $prefix Prefix for the temporary file name.
 * @return string|false Path of the temporary file (the caller must unlink
 *                      it), or false when the download or the write fails.
 */
function fetch_url_to_temp_file($url, $prefix)
{
    $data = get_url($url);
    if ($data === false) {
        return false;
    }

    $path = tempnam(sys_get_temp_dir(), $prefix);
    if ($path === false) {
        return false;
    }

    if (file_put_contents($path, $data) === false) {
        unlink($path);
        return false;
    }

    return $path;
}

//=========DOCX===========
/**
 * Downloads a .docx file, extracts its text, writes it to $file_name and
 * echoes it.
 *
 * Paragraph starts that carry attributes, table rows and paragraph ends
 * become "\n\r", tab elements become "\t", and all other markup is stripped.
 * The output is " " when the download or the archive cannot be opened.
 *
 * @param string $url       Location of the .docx file.
 * @param string $file_name File that receives the extracted text.
 */
function extractDocxText($url, $file_name)
{
    $output_text = " ";

    $tmp = fetch_url_to_temp_file($url, 'docx');
    if ($tmp !== false) {
        $zip = new ZipArchive;
        if ($zip->open($tmp) === true) {
            $output_text = "";

            // Entry names are case-sensitive: the part is "word/document.xml".
            $xml = $zip->getFromName("word/document.xml");
            if ($xml !== false) {
                $xml = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-z"0-9 :="]+">/', "\n\r", $xml);
                // The remaining markers are literal strings; str_replace()
                // avoids the unescaped "/" delimiter problem entirely.
                $xml = str_replace('<w:tr>', "\n\r", $xml);
                $xml = str_replace('<w:tab/>', "\t", $xml);
                $xml = str_replace('</w:p>', "\n\r", $xml);
                $output_text = strip_tags($xml);
            }
            $zip->close();
        }
        unlink($tmp);
    }

    // save to file or echo content
    file_put_contents($file_name, $output_text);
    echo $output_text;
}

//========PDF===========
// Requires installation in your Linux server:
//   sudo su
//   apt-get install xpdf
/**
 * Downloads a PDF, converts it with pdf2text(), echoes the text and writes
 * it to $PDF_fullpath_or_Filename.
 *
 * NOTE(review): pdf2text() is not defined in this snippet; it must be
 * provided elsewhere.
 *
 * @param string $url                      Location of the PDF.
 * @param string $PDF_fullpath_or_Filename File that receives the text.
 */
function extractPdfText($url, $PDF_fullpath_or_Filename)
{
    $content = '';

    $tmp = fetch_url_to_temp_file($url, 'pdf');
    if ($tmp !== false) {
        $content = pdf2text($tmp);
        unlink($tmp);
    }

    echo $content;
    file_put_contents($PDF_fullpath_or_Filename, $content);
}

//========DOC==========
/**
 * Downloads a legacy .doc file, extracts its text, echoes it and writes it
 * to $file_name.
 *
 * The content is split on carriage returns; chunks that are empty or
 * contain a NUL byte are dropped and the rest are joined with "\n\r". Every
 * character outside the whitelist is replaced by a space.
 *
 * @param string $url       Location of the .doc file.
 * @param string $file_name File that receives the extracted text.
 */
function extractDocText($url, $file_name)
{
    $outtext = "";

    $doc = get_url($url);
    if ($doc !== false) {
        foreach (explode(chr(0x0D), $doc) as $thisline) {
            if ($thisline === "" || strpos($thisline, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $thisline . "\n\r";
        }
    }

    // The class must be negated ("^"): without it the valid characters are
    // the ones blanked out. "/" is escaped because it is the delimiter.
    $content = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_]/', ' ', $outtext);

    echo $content;
    file_put_contents($file_name, $content);
}

//========XLSX==========
/**
 * Downloads an .xlsx workbook, collects the shared-string cell values of
 * the first worksheet, echoes them and writes them to $file_name.
 *
 * Only cells of type "s" (shared string) are output; each text node is
 * followed by a space and a newline.
 *
 * @param string $url       Location of the .xlsx file.
 * @param string $file_name File that receives the extracted text.
 */
function extractXlsxText($url, $file_name)
{
    $content = "";

    $tmp = fetch_url_to_temp_file($url, 'xlsx');
    if ($tmp !== false) {
        $zip = new ZipArchive();
        if ($zip->open($tmp) === true) {
            // Read the two parts straight from the archive instead of
            // extracting everything into a directory that is never removed.
            $stringsXml = $zip->getFromName('xl/sharedStrings.xml');
            $sheetXml = $zip->getFromName('xl/worksheets/sheet1.xml');
            $zip->close();

            $strings = ($stringsXml !== false && $stringsXml !== '')
                ? simplexml_load_string($stringsXml, 'SimpleXMLElement', LIBXML_NONET)
                : false;
            $sheet = ($sheetXml !== false && $sheetXml !== '')
                ? simplexml_load_string($sheetXml, 'SimpleXMLElement', LIBXML_NONET)
                : false;

            if ($strings !== false && $sheet !== false) {
                foreach ($sheet->sheetData->row as $xlrow) {
                    foreach ($xlrow->c as $cell) {
                        // Only shared-string cells reference sharedStrings.xml.
                        if (!isset($cell['t']) || (string) $cell['t'] !== 's') {
                            continue;
                        }

                        $index = (int) (string) $cell->v;
                        if (!isset($strings->si[$index])) {
                            continue;
                        }
                        $si = $strings->si[$index];

                        // Register & alias the default namespace or the
                        // xpath query returns empty results.
                        $si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');

                        $textNodes = $si->xpath('.//n:t');
                        if (!is_array($textNodes)) {
                            continue;
                        }
                        // Cat together all of the 't' node values.
                        foreach ($textNodes as $t) {
                            $content .= $t . " \n";
                        }
                    }
                }
            }
        }
        unlink($tmp);
    }

    echo $content;
    file_put_contents($file_name, $content);
}

//========PPT==========
/**
 * Reads a legacy .ppt file and writes the text it finds to $file_name.
 *
 * The content is split on byte 0x0F. A chunk is treated as text when three
 * NUL bytes start at offset 1; the text runs from offset 4 to the next NUL
 * byte. Characters outside the whitelist become spaces, and each text is
 * prepended to the result followed by "\n".
 *
 * @param string $url       Path or URL readable by file_get_contents().
 * @param string $file_name File that receives the extracted text.
 */
function extractPptText($url, $file_name)
{
    $outtext = '';

    // The content is already in memory, so no temporary copy is written.
    $ppt = file_get_contents($url);
    if ($ppt !== false) {
        $marker = chr(0x00) . chr(0x00) . chr(0x00);
        foreach (explode(chr(0x0f), $ppt) as $thisline) {
            if (strpos($thisline, $marker) !== 1) {
                continue;
            }

            $text_line = substr($thisline, 4);
            $end_pos = strpos($text_line, chr(0x00));
            if ($end_pos === false) {
                // No terminating NUL byte: there is no delimited text here.
                continue;
            }

            $text_line = substr($text_line, 0, $end_pos);
            // "/" is escaped because it is also the pattern delimiter.
            $text_line = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_]/', " ", $text_line);
            $outtext = $text_line . "\n" . $outtext;
        }
    }

    //echo $outtext;
    file_put_contents($file_name, $outtext);
}

//========PPTX==========
/**
 * Downloads a .pptx presentation, extracts the text of its slides, echoes
 * it and writes it to $file_name.
 *
 * Slides are read as ppt/slides/slide1.xml, slide2.xml, ... until the first
 * missing number. The output always starts with a single space.
 *
 * @param string $url       Location of the .pptx file.
 * @param string $file_name File that receives the extracted text.
 */
function extractPptxText($url, $file_name)
{
    $output_text = ' ';

    $tmp = fetch_url_to_temp_file($url, 'pptx');
    if ($tmp !== false) {
        $zip = new ZipArchive;
        if ($zip->open($tmp) === true) {
            $slide_number = 1;
            // loop through slide files
            while (($xml_index = $zip->locateName("ppt/slides/slide" . $slide_number . ".xml")) !== false) {
                $xml_datas = $zip->getFromIndex($xml_index);

                // Re-serialising with formatOutput puts each element on its
                // own line, which keeps the text runs apart once the tags
                // are stripped.
                $xml_handle = new DOMDocument();
                $xml_handle->preserveWhiteSpace = true;
                $xml_handle->formatOutput = true;

                // LIBXML_NOENT is not used: the file comes from a remote
                // source and entity substitution would allow XXE.
                if ($xml_datas !== false && $xml_datas !== ''
                    && $xml_handle->loadXML($xml_datas, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
                    // Strip the markup so the result is text, not raw XML.
                    $output_text .= strip_tags($xml_handle->saveXML());
                }
                $slide_number++;
            }
            $zip->close();
        }
        unlink($tmp);
    }

    echo $output_text;
    file_put_contents($file_name, $output_text);
}

/*
 * ==========================================================================
 * get_url(): fetches a URL with cURL (used instead of file_get_contents()).
 * ==========================================================================
 */
/**
 * Fetches the content of a URL.
 *
 * The URL is trimmed, URL-decoded and has "&amp;" replaced by "&" before the
 * request. Redirects are followed (at most 10) and any encoding supported by
 * cURL is accepted.
 *
 * @param string $url     URL to fetch.
 * @param int    $timeout Connect and total timeout in seconds.
 * @return string|false Response body, or false on failure.
 */
function get_url( $url,$timeout = 5 )
{
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1");
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_ENCODING, "");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    // Peer verification stays enabled: turning it off lets anyone on the
    // network path replace the downloaded document.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    $content = curl_exec($ch);
    //$response = curl_getinfo( $ch );
    curl_close($ch);

    return $content;
}

Jezz · Answer

Docxドキュメントの場合、docx2txtツールの使用をお勧めします（少なくともDebian/Ubuntuで利用可能）：

# Feed the .docx file to docx2txt on standard input.
docx2txt < your_file.docx

READMEには、vimと統合する方法が説明されています。.vimrcに以下を追加します：

" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
" Each autocmd needs its own line: on a single line everything after the
" leading quote is part of the comment and neither command is registered.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt

（emacsと統合する方法も説明します）。

自分で改造したい方のために補足すると、このツールはPerlで書かれています。