ファイルヘッダーに基づいてdoc、docx、pdf、xls、xlsxを識別する方法

Question

C＃のファイルヘッダーに基づいてdoc、docx、pdf、xls、xlsxを識別する方法この2つはどちらも操作できるため、ファイル拡張子MimeMapping.GetMimeMappingに依存したくありません。

ヘッダーの読み方は知っていますが、ファイルがdoc、docx、pdf、xls、xlsxの場合、バイトの組み合わせで何と言えるかわかりません。何かご意見は？

Alex · Answer

この質問には、ファイルの最初のバイトを使用してファイルタイプを判別する例が含まれています。。NETを使用して、拡張子ではなくファイル署名に基づいてファイルのMIMEタイプをどのように見つけることができますか

これは非常に長い投稿なので、以下の関連する回答を投稿します。

public class MimeType { private static readonly byte[] BMP = { 66, 77 }; private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 }; private static readonly byte[] EXE_DLL = { 77, 90 }; private static readonly byte[] GIF = { 71, 73, 70, 56 }; private static readonly byte[] ICO = { 0, 0, 1, 0 }; private static readonly byte[] JPG = { 255, 216, 255 }; private static readonly byte[] MP3 = { 255, 251, 48 }; private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 }; private static readonly byte[] PDF = { 37, 80, 68, 70, 45, 49, 46 }; private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 }; private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7, 0 }; private static readonly byte[] SWF = { 70, 87, 83 }; private static readonly byte[] TIFF = { 73, 73, 42, 0 }; private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 }; private static readonly byte[] TTF = { 0, 1, 0, 0, 0 }; private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 }; private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 }; private static readonly byte[] Zip_DOCX = { 80, 75, 3, 4 }; public static string GetMimeType(byte[] file, string fileName) { string mime = "application/octet-stream"; //DEFAULT UNKNOWN MIME TYPE //Ensure that the filename isn't empty or null if (string.IsNullOrWhiteSpace(fileName)) { return mime; } //Get the file extension string extension = Path.GetExtension(fileName) == null ? string.Empty : Path.GetExtension(fileName).ToUpper(); //Get the MIME Type if (file.Take(2).SequenceEqual(BMP)) { mime = "image/bmp"; } else if (file.Take(8).SequenceEqual(DOC)) { mime = "application/msword"; } else if (file.Take(2).SequenceEqual(EXE_DLL)) { mime = "application/x-msdownload"; //both use same mime type } else if (file.Take(4).SequenceEqual(GIF)) { mime = "image/gif"; } else if (file.Take(4).SequenceEqual(ICO)) { mime = "image/x-icon"; } else if (file.Take(3).SequenceEqual(JPG)) { mime = "image/jpeg"; } else if (file.Take(3).SequenceEqual(MP3)) { mime = "audio/mpeg"; } else if (file.Take(14).SequenceEqual(OGG)) { if (extension == ".OGX") { mime = "application/ogg"; } else if (extension == ".OGA") { mime = "audio/ogg"; } else { mime = "video/ogg"; } } else if (file.Take(7).SequenceEqual(PDF)) { mime = "application/pdf"; } else if (file.Take(16).SequenceEqual(PNG)) { mime = "image/png"; } else if (file.Take(7).SequenceEqual(RAR)) { mime = "application/x-rar-compressed"; } else if (file.Take(3).SequenceEqual(SWF)) { mime = "application/x-shockwave-flash"; } else if (file.Take(4).SequenceEqual(TIFF)) { mime = "image/tiff"; } else if (file.Take(11).SequenceEqual(TORRENT)) { mime = "application/x-bittorrent"; } else if (file.Take(5).SequenceEqual(TTF)) { mime = "application/x-font-ttf"; } else if (file.Take(4).SequenceEqual(WAV_AVI)) { mime = extension == ".AVI" ? "video/x-msvideo" : "audio/x-wav"; } else if (file.Take(16).SequenceEqual(WMV_WMA)) { mime = extension == ".WMA" ? "audio/x-ms-wma" : "video/x-ms-wmv"; } else if (file.Take(4).SequenceEqual(Zip_DOCX)) { mime = extension == ".DOCX" ? "application/vnd.openxmlformats-officedocument.wordprocessingml.document" : "application/x-Zip-compressed"; } return mime; }

user2173353 · Answer

ファイル署名を使用することはそれほど現実的ではありません（新しいOffice形式はZipファイルであり、古いOfficeファイルはOLE CF/OLE SSコンテナー）であるため）、 C＃コードを使用してそれらを読み取り、それらが何であるかを理解できます。

最新のOffice形式については、System.IO.Packagingを使用して（DOCX/PPTX/XLSX/...）Zipファイルを読み取ることができます。 https://msdn.Microsoft.com/en-us/library/ms568187 （v = vs.110）.aspx そうすることで、最初のドキュメントパーツのContentTypeを見つけ、それを使用して推測できます。

古いOfficeファイル（Office 2003）の場合、このライブラリを使用して、内容に基づいてファイルを区別できます（MSIおよびMSGファイルもこのファイル形式を使用していることに注意してください）： http： //sourceforge.net/projects/openmcdf/

たとえば、XLSファイルの内容は次のとおりです。

これが役に立てば幸いです！ :)

もし私がこの答えを以前に見つけていたら、それは確かに私を助けたでしょう。 ;）

rymdsmurf · Answer

OPがOfficeファイル形式について具体的に言及している場合、user2173353からの回答が最も正確です。ただし、ライブラリ全体（OpenMCDF）を追加してレガシーOffice形式を特定するという考えは好きではなかったので、これを行うための独自のルーチンを作成しました。

 public static CfbFileFormat GetCfbFileFormat(Stream fileData) { if (!fileData.CanSeek) throw new ArgumentException("Data stream must be seekable.", nameof(fileData)); try { // Notice that values in a CFB files are always little-endian. Fortunately BinaryReader.ReadUInt16/ReadUInt32 reads with little-endian. // If using .net < 4.5 this BinaryReader constructor is not available. Use a simpler one but remember to also remove the 'using' statement. using (BinaryReader reader = new BinaryReader(fileData, Encoding.Unicode, true)) { // Check that data has the CFB file header var header = reader.ReadBytes(8); if (!header.SequenceEqual(new byte[] {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1})) return CfbFileFormat.Unknown; // Get sector size (2 byte uint) at offset 30 in the header // Value at 1C specifies this as the power of two. The only valid values are 9 or 12, which gives 512 or 4096 byte sector size. fileData.Position = 30; ushort readUInt16 = reader.ReadUInt16(); int sectorSize = 1 << readUInt16; // Get first directory sector index at offset 48 in the header fileData.Position = 48; var rootDirectoryIndex = reader.ReadUInt32(); // File header is one sector wide. After that we can address the sector directly using the sector index var rootDirectoryAddress = sectorSize + (rootDirectoryIndex * sectorSize); // Object type field is offset 80 bytes into the directory sector. It is a 128 bit GUID, encoded as "DWORD, Word, Word, BYTE[8]". fileData.Position = rootDirectoryAddress + 80; var bits127_96 = reader.ReadInt32(); var bits95_80 = reader.ReadInt16(); var bits79_64 = reader.ReadInt16(); var bits63_0 = reader.ReadBytes(8); var guid = new Guid(bits127_96, bits95_80, bits79_64, bits63_0); // Compare to known file format GUIDs CfbFileFormat result; return Formats.TryGetValue(guid, out result) ? result : CfbFileFormat.Unknown; } } catch (IOException) { return CfbFileFormat.Unknown; } catch (OverflowException) { return CfbFileFormat.Unknown; } } public enum CfbFileFormat { Doc, Xls, Msi, Ppt, Unknown } private static readonly Dictionary<Guid, CfbFileFormat> Formats = new Dictionary<Guid, CfbFileFormat> { {Guid.Parse("{00020810-0000-0000-c000-000000000046}"), CfbFileFormat.Xls}, {Guid.Parse("{00020820-0000-0000-c000-000000000046}"), CfbFileFormat.Xls}, {Guid.Parse("{00020906-0000-0000-c000-000000000046}"), CfbFileFormat.Doc}, {Guid.Parse("{000c1084-0000-0000-c000-000000000046}"), CfbFileFormat.Msi}, {Guid.Parse("{64818d10-4f9b-11cf-86ea-00aa00b929e8}"), CfbFileFormat.Ppt} };

必要に応じて、フォーマット識別子を追加できます。

私はこれを.docと.xlsで試しましたが、うまくいきました。私は、4096バイトのセクターサイズを使用するCFBファイルでをテストしていません。どこにあるかさえわからないためです。

コードは、次のドキュメントの情報に基づいています。

user369142 · Answer

user2173353は、新しいOffice .docx/.xlsx形式を検出するための正しいソリューションのようです。これにいくつかの詳細を追加するために、これらを正しく識別するために以下のチェックが表示されます。

 /// <summary> /// MS .docx, .xslx and other extensions are (correctly) identified as Zip files using signature lookup. /// This tests if System.IO.Packaging is able to open, and if package has parts, this is not a Zip file. /// </summary> /// <param name="stream"></param> /// <returns></returns> private static bool IsPackage(this Stream stream) { Package package = Package.Open(stream, FileMode.Open, FileAccess.Read); return package.GetParts().Any(); }