HTMLドキュメント全体をInternet Explorerのドキュメントフラグメントにロードできますか？

Question

これは私が少し苦労してきたものです。ユーザーがリモートWebページをフェッチし、その結果のページでフォームを検索できるようにする必要があるローカルのクライアント側スクリプトがあります。これを行うには（正規表現なしで）、ドキュメントを完全に通過可能なDOMオブジェクトに解析する必要があります。

私が強調したいいくつかの制限：

ライブラリ（jQueryなど）を使用したくありません。私がここでやらなければならないことに対して、膨らみが多すぎます。
（セキュリティ上の理由から）どのような状況でもリモートページのスクリプトを実行しないでください。
getElementsByTagNameなどのDOM APIが利用可能である必要があります。
Internet Explorerでのみ動作すれば十分ですが、少なくともIE7以降で動作する必要があります。
サーバーにアクセスできないふりをしてみましょう。私はそうしますが、これには使用できません。

私が試したこと

変数htmlに完全なHTMLドキュメント文字列（DOCTYPE宣言を含む）があるとすると、これまでに試したのは次のとおりです。

// Attempts made so far. `html` holds the complete HTML document string
// (including the DOCTYPE declaration). Each "//->" note records the observed
// outcome of the statement directly above it.
var frag = document.createDocumentFragment(),
    div = frag.appendChild(document.createElement("div"));

div.outerHTML = html;
//-> results in an empty fragment

div.insertAdjacentHTML("afterEnd", html);
//-> HTML is not added to the fragment

div.innerHTML = html;
//-> Error (expected, but I tried it anyway)

var doc = new ActiveXObject("htmlfile");
doc.write(html);
doc.close();
//-> JavaScript executes

また、HTMLから<head>および<body>nodeを抽出し、フラグメント内の<HTML>要素に追加しようとしましたが、まだうまくいきません。

誰かアイデアはありますか？

Rob W · Accepted Answer

フィドル： http://jsfiddle.net/JFSKe/6/

DocumentFragmentはDOMメソッドを実装していません。_document.createElement_をinnerHTMLと組み合わせて使用すると、_<head>_および_<body>_タグが削除されます（作成した要素がルート要素の_<html>_であっても同様です）。したがって、別の方法で解決する必要があります。そこで、非表示のインラインフレームを利用する、クロスブラウザーの文字列→DOM変換関数を作成しました。

すべての外部リソースとスクリプトは無効になります。詳細については、コードの説明を参照してください。

コード

/*
 @param String html    The string with HTML which has to be converted to a DOM object
 @param func callback  (optional) Callback(HTMLDocument doc, function destroy)
 @returns              undefined if callback exists, else: Object
                        HTMLDocument doc  DOM fetched from Parameter:html
                        function destroy  Removes HTMLDocument doc.
 */
function string2dom(html, callback){
    /* Sanitise the string (scripts, event handlers and external pointers are disabled) */
    html = sanitiseHTML(html);

    /* Create a hidden IFrame; it must be attached to the document to get a usable DOM */
    var iframe = document.createElement("iframe");
    iframe.style.display = "none";
    document.body.appendChild(iframe);

    /* open()/close() empty the frame's previous content before/after writing */
    var doc = iframe.contentDocument || iframe.contentWindow.document;
    doc.open();
    doc.write(html);
    doc.close();

    /* Removes the IFrame (and thus the generated DOM tree) from the page */
    function destroy(){
        iframe.parentNode.removeChild(iframe);
    }
    if(callback) callback(doc, destroy);
    else return {"doc": doc, "destroy": destroy};
}

/* @name sanitiseHTML
   @param String html  A string representing HTML code
   @return String      A new string, fully stripped of external resources.
                       All "external" attributes (href, src) are prefixed by data-
 */
function sanitiseHTML(html){
    /* Adds a <!-\"'--> before every matched tag, so that unterminated quotes
        aren't preventing the browser from splitting a tag. Test case:
       '<input style="foo;b:url(0);><input onclick="<input type=button onclick="too() href=;>">' */
    var prefix = "<!--\"'-->";
    /* Attributes should not be prefixed by these characters. This list is not
       complete, but will be sufficient for this function.
       (see http://www.w3.org/TR/REC-xml/#NT-NameChar) */
    var att = "[^-a-z0-9:._]";
    var tag = "<[a-z]";
    var any = "(?:[^<>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^<>]*";
    var etag = "(?:>|(?=<))";

    /* NOTE: every pattern below is RegExp *source* held in a string literal,
       so each RegExp backslash has to be doubled ("\\s" yields \s). */
    var entityEnd = "(?:;|(?!\\d))";
    /* Placeholder to avoid tricky filter-circumventing methods */
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
                "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
                ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
                ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"};
    var charMap = {};
    var s = ents[" "]+"*"; /* Short-hand space */

    /*
      @name ae
      @description          Converts a given string in a sequence of the
                             original input and the HTML entity
      @param String string  String to convert
      @return String        RegExp source matching the string in any casing /
                             numeric entity spelling (cached in ents / charMap)
      */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
      @name by
      @description  second argument for the replace function.
      */
    function by(match, group1, group2){
        /* Adds a data-prefix before every external pointer */
        return group1 + "data-" + group2;
    }

    /*
      @name cr
      @description            Selects a HTML element and performs a
                               search-and-replace on attributes
      @param String selector  HTML substring to match
                               Important: Must be pre- and postfixed by < and >.
                               RE matches a whole tag!
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                               before an occurence of <end> when quotes are missing
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        marker = typeof marker == "string" ? marker : "\\s*=";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        end = typeof end == "string" ? end : "";
        /* When an end pattern is given, the unquoted value match becomes lazy */
        var is_end = end && "?";
        var re1 = new RegExp("("+att+")("+attribute+marker+"(?:\\s*\"[^\""+delimiter+"]*\"|\\s*'[^'"+delimiter+"]*'|[^\\s"+delimiter+"]+"+is_end+")"+end+")", "gi");
        html = html.replace(selector, function(match){
            return prefix + match.replace(re1, by);
        });
    }

    /*
      @name cri
      @description            Selects an attribute of a HTML element, and
                               performs a search-and-replace on certain values
      @param String selector  HTML element to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi"
      @param String delimiter Optional RegExp-escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                               before an occurence of <end> when quotes are missing
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        flags = typeof flags == "string" ? flags : "gi";
        var re1 = new RegExp("("+att+attribute+"\\s*=)((?:\\s*\"[^\"]*\"|\\s*'[^']*'|[^\\s>]+))", "gi");

        end = typeof end == "string" ? end + ")" : ")";
        var at1 = new RegExp('(")('+front+'[^"]+")', flags);
        var at2 = new RegExp("(')("+front+"[^']+')", flags);
        var at3 = new RegExp("()("+front+'(?:"[^"]+"|\'[^\']+\'|(?:(?!'+delimiter+').)+)'+end, flags);

        /* Pick the value pattern that matches the quoting style of the attribute */
        var handleAttr = function(match, g1, g2){
            if(g2.charAt(0) == '"') return g1+g2.replace(at1, by);
            if(g2.charAt(0) == "'") return g1+g2.replace(at2, by);
            return g1+g2.replace(at3, by);
        };
        html = html.replace(selector, function(match){
            return prefix + match.replace(re1, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    html = html.replace(new RegExp("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+etag+"|'"+ae("refresh")+"'"+any+etag+"|"+ae("refresh")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "gi"), "<!-- meta http-equiv=refresh stripped-->");

    /* Stripping all scripts. CDATA-wrapped scripts go first, because they may
       legally contain the string </script> inside the code. */
    html = html.replace(new RegExp("<script"+any+">\\s*//\\s*<\\[CDATA\\[[\\S\\s]*?]]>\\s*</script[^>]*>", "gi"), "<!--CDATA script-->");
    html = html.replace(/<script[\S\s]+?<\/script\s*>/gi, "<!--Non-CDATA script-->");
    cr(tag+any+att+"on[-a-z0-9:_.]+="+any+etag, "on[-a-z0-9:_.]+"); /* Event listeners */

    cr(tag+any+att+"href\\s*="+any+etag, "href"); /* Linked elements */
    cr(tag+any+att+"src\\s*="+any+etag, "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+etag, "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+etag, "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+etag+"|'"+ae("movie")+"'"+any+etag+"|"+ae("movie")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "value");

    /* <style> and < style=  > url()*/
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")"));

    /* IE7- CSS expression() */
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "expression", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("expression")+s+ae("(")+s, 0, s+ae(")"), ae(")"));

    /* Collapse runs of the prefix that stacked up over the successive passes */
    return html.replace(new RegExp("(?:"+prefix+")+", "g"), prefix);
}

コードの説明

sanitiseHTML関数は、私の_replace_all_rel_by_abs_関数に基づいています（この答えを参照）。最大の効率と信頼性を達成するために、sanitiseHTML関数は完全に書き直されています。

さらに、すべてのスクリプトとイベントハンドラー（IE7以前のCSS expression()を含む）を削除するために、新しいRegExpのセットが追加されています。すべてのタグが期待どおりに解析されるようにするため、調整済みのタグの前には_<!--"'-->_が付けられます。この接頭辞は、閉じられていない引用符を含むネストされた「イベントハンドラー」を正しく解析するために必要です（例：<a id="><input onclick="<div onmousemove=evil()>">）。

これらのRegExpは、内部関数cr/cri（Create Replace [Inline]）を使用して動的に作成されます。これらの関数は引数のリストを受け取り、高度な正規表現置換を作成して実行します。HTMLエンティティによってRegExpがすり抜けられないようにするため（_<meta http-equiv=refresh>_のrefreshはさまざまな方法で記述できます）、動的に作成されるRegExpの一部は関数ae（Any Entity）によって構築されます。
実際の置換は、関数byによって行われます（replace by）。この実装では、byは、一致するすべての属性の前に_data-_を追加します。

_<script>//<[CDATA[ .. //]]></script>_の出現箇所はすべて取り除かれます。CDATAセクションではコード内に_</script>_文字列を含めることができるため、この手順が必要です。この置換を実行した後であれば、次の置換に進んでも安全です。
残りの_<script>...</script>_タグは削除されます。
_<meta http-equiv=refresh .. >_タグが削除されました
すべてのイベントリスナーと外部ポインター／属性（href、src、url()）には、前述のとおり接頭辞_data-_が付けられます。
IFrameオブジェクトが作成されます。 IFrameはメモリをリークする可能性が低くなります（htmlfile ActiveXObjectとは異なります）。 IFrameは非表示になり、ドキュメントに追加されるため、DOMにアクセスできます。 document.write()は、HTMLをIFrameに書き込むために使用されます。 document.open()およびdocument.close()は、ドキュメントの以前のコンテンツを空にするために使用されます。これにより、生成されたドキュメントは、指定されたhtml文字列の正確なコピーになります。
コールバック関数が指定されている場合、その関数は2つの引数で呼び出されます。第1引数は、生成されたdocumentオブジェクトへの参照です。第2引数は関数であり、呼び出すと生成されたDOMツリーを破棄します。この関数は、ツリーが不要になった時点で呼び出す必要があります。
コールバック関数が指定されていない場合、関数は2つのプロパティ（docおよびdestroy）で構成されるオブジェクトを返します。これらのオブジェクトは、前述の引数と同じように動作します。

その他の注意事項

designModeプロパティを "On"に設定すると、フレームがスクリプトを実行しなくなります（Chromeではサポートされません）。特定の理由で_<script>_タグを保持する必要がある場合は、スクリプトストリップ機能の代わりに_iframe.designMode = "On"_を使用できます。
_htmlfile activeXObject_の信頼できるソースを見つけることができませんでした。 this source によると、htmlfileはIFrameよりも遅く、メモリリークの影響を受けやすくなっています。
影響を受けるすべての属性（href、src、...）には、_data-_が前に付きます。これらの属性を取得/変更する例を_data-href_に示します。
elem.getAttribute("data-href")およびelem.setAttribute("data-href", "...")
_elem.dataset.href_および_elem.dataset.href = "..."_。
外部リソースが無効になっています。その結果、ページは完全に異なる場合があります。
~~_<link rel="stylesheet" href="main.css" />_~~ 外部スタイルなし
~~_<script>document.body.bgColor="red";</script>_~~ スクリプトスタイルなし
_<img src="128x128.png" />_画像なし：要素のサイズは完全に異なる場合があります。

例

sanitiseHTML(html)
このブックマークレットを場所のバーに貼り付けます。テキスト領域を挿入するオプションが提供され、サニタイズされたHTML文字列が表示されます。

_javascript:void(function(){var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/html-sanitizer.js";document.body.appendChild(s)})(); _

コード例-string2dom(html)：

/* Callback form: the generated document and its destroy function are passed in */
string2dom("<html><head><title>Test</title></head></html>", function(doc, destroy){
    alert(doc.title); /* Alert: "Test" */
    destroy();
});

/* Return-value form: an object exposing the same doc / destroy pair */
var test = string2dom("<div id='secret'></div>");
alert(test.doc.getElementById("secret").tagName); /* Alert: "DIV" */
test.destroy();

注目すべき参考文献

SO：絶対URLに関連するすべてを変更するJS RE -関数sanitiseHTML(html)は、以前に作成したreplace_all_rel_by_abs(html)関数に基づいています。
要素-埋め込みコンテンツ -標準の埋め込み要素の完全なリスト
要素 - 過去のHTML要素 - （非推奨の）要素の追加リスト（_<applet>_など）

htmlfile ActiveXオブジェクト - 「iframeサンドボックスより遅く、適切に管理しないとメモリリークを起こす」

Chris Baker · Answer

なぜDocumentFragmentを使おうとしているのかは分かりませんが、HTMLテキストを新しいdiv要素のinnerHTMLとして設定するだけで済みます。そうすれば、divをDOMに追加しなくても、そのdiv要素に対してgetElementsByTagNameなどを使用できます。

// Parse the markup by assigning it to a detached <div>; the element is never
// attached to the page, yet DOM lookups work on its subtree.
var htmlText = '<html><head><title>Test</title></head><body><div id="test_ele1">this is test_ele1 content</div><div id="test_ele2">this is test_ele content2</div></body></html>';

var d = document.createElement('div');
d.innerHTML = htmlText;

console.log(
    d.getElementsByTagName('div')
);

どうしてもDocumentFragmentを使いたい場合は、次のコードを使用できますが、目的のDOM関数を利用するには、結局フラグメントをdivでラップする必要があります。

/**
 * Parses an HTML string through a Range and returns the result wrapped in a
 * detached <div>, so element-level DOM methods are available on the result.
 *
 * @param {string} htmlText Markup to parse.
 * @returns {HTMLDivElement} A new div containing the parsed fragment.
 */
function makeDocumentFragment(htmlText) {
    var parsed = document.createRange().createContextualFragment(htmlText);
    var wrapper = document.createElement('div');

    wrapper.appendChild(parsed);
    return wrapper;
}

Eli Grey · Answer

IEがdocument.implementation.createHTMLDocumentをサポートしているかどうかは分かりませんが、サポートしている場合は次のアルゴリズムを使用してください（私のDOMParser HTML拡張を改変したものです）。DOCTYPEは保持されない点に注意してください：

// Build an empty HTML document and load the markup into its root element.
// `your_html_here` is the HTML string to parse.
var doc = document.implementation.createHTMLDocument(""),
    doc_elt = doc.documentElement,
    first_elt;

doc_elt.innerHTML = your_html_here;
first_elt = doc_elt.firstElementChild;

// Are we dealing with an entire document or a fragment? When the markup
// carried its own <html> element, promote it to be the document root.
if (doc_elt.childElementCount === 1 &&
        first_elt.tagName.toLowerCase() === "html") {
    doc.replaceChild(first_elt, doc_elt);
}

// doc is an HTML document
// you can now reference stuff like doc.title, etc.

Dr.Molle · Answer

HTMLも有効なXMLであると想定して、 loadXML（）を使用できます。

Jérémy Lal · Answer

非互換性を処理することなく、リクエストをトリガーすることなく完全なHTML DOM機能を使用するには：

// Shallow-clone the current document to obtain an offline HTML document.
var doc = document.cloneNode();

// If the clone has no root element, build the html/head/body skeleton.
if (!doc.documentElement) {
    doc.appendChild(doc.createElement('html'));
    doc.documentElement.appendChild(doc.createElement('head'));
    doc.documentElement.appendChild(doc.createElement('body'));
}

準備完了！ docはhtmlドキュメントですが、オンラインではありません。

Javier Pedemonte · Answer

DocumentFragmentはgetElementsByTagNameをサポートしていません-Documentのみがサポートしています。

jsdom のようなライブラリを使用する必要がある場合があります。これは、DOMの実装を提供し、getElementsByTagNameおよび他のDOM APIを使用して検索できるようにします。また、スクリプトを実行しないように設定することもできます。はい、それは「重い」ので、IE 7.で動作するかどうかはわかりません。

Pebbl · Answer

たまたまこのページにたどり着いただけで、役に立つには少し遅すぎましたが :) 以下は、今後同様の問題を抱える人の助けになるはずです。ただし、IE7/8はもはや無視してよく、より新しいブラウザーではもっと優れた方法がサポートされています。

以下は、私がテストしたほぼすべてにわたって機能します-唯一の2つの欠点は次のとおりです。

ルートのdiv要素にビスポークgetElementById関数とgetElementsByName関数を追加したので、これらはツリーの下の方で期待どおりに表示されません（コードがこれに対応するように変更されていない限り）。
Doctypeは無視されます-ただし、私の経験では、Doctypeはdomの構造に影響を与えず、レンダリングの方法に影響を与えるため、大きな違いはないと思います（このメソッドでは明らかに発生しません）。

基本的に、この仕組みは、<tag>と<namespace:tag>がユーザーエージェントによって異なる方法で処理されるという事実に依存しています。ご存じのとおり、特定の特別なタグはdiv要素内に存在できないため、削除されてしまいます。一方、名前空間付きの要素は（DTDで別途規定されていない限り）どこにでも配置できます。これらの名前空間付きタグは、実際には元のタグとしては動作しませんが、ドキュメント内の構造上の位置を知るためだけに使用するので、問題にはなりません。

マークアップとコードは次のとおりです。

<!DOCTYPE html>
<html>
<head>
<script>

  /// function for parsing HTML source to a dom structure
  /// Tested in Mac OSX, Win 7, Win XP with FF, IE 7/8/9,
  /// Chrome, Safari & Opera.
  function parseHTML(src){

    /// create a random div, this will be our root
    var div = document.createElement('div'),
        /// specify our namespace prefix
        ns = 'faux:',
        /// state which tags we will treat as "special"
        stn = ['html','head','body','title'],
        /// the reg exp for replacing the special tags
        re = new RegExp('<(/?)('+stn.join('|')+')([^>]*)?>','gi'),
        /// remember the getElementsByTagName function before we override it
        gtn = div.getElementsByTagName;

    /// a quick function to namespace certain tag names
    var nspace = function(tn){
      if ( stn.indexOf ) {
        return stn.indexOf(tn) != -1 ? ns + tn : tn;
      }
      else {
        /// no Array.indexOf: look the name up between delimiters, so that
        /// e.g. 'b' is not mistaken for a special tag because of '|body|'
        return ('|'+stn.join('|')+'|').indexOf('|'+tn+'|') != -1 ? ns + tn : tn;
      }
    };

    /// search and replace our source so that special tags are namespaced
    /// &nbsp; required for IE7/8 to render tags before first text found
    /// <faux:check /> tag added so we can test how namespaces work
    src = '&nbsp;<'+ns+'check />' + src.replace(re,'<$1'+ns+'$2$3>');
    /// inject to the div
    div.innerHTML = src;
    /// quick test to see how we support namespaces in TagName searches
    if ( !div.getElementsByTagName(ns+'check').length ) {
      ns = '';
    }

    /// create our replacement getByName and getById functions.
    /// attr is the attribute to compare; collect=true gathers every match
    /// into an array, collect=false returns the first matching element.
    /// Both variants return false when nothing matches.
    var createGetElementByAttr = function(attr, collect){
      var func = function(a,w){
        var i,c,e,f,l,o;
        w = w||[];
        if ( this.nodeType == 1 ) {
          if ( this.getAttribute(attr) == a ) {
            if ( collect ) {
              w.push(this);
            }
            else {
              return this;
            }
          }
        }
        else {
          return false;
        }
        /// recurse into the element children
        if ( (c = this.childNodes) && (l = c.length) ) {
          for( i=0; i<l; i++ ){
            if( (e = c[i]) && (e.nodeType == 1) ) {
              if ( (f = func.call( e, a, w )) && !collect ) {
                return f;
              }
            }
          }
        }
        return (w.length?w:false);
      }
      return func;
    }

    /// apply these replacement functions to the div container, obviously
    /// you could add these to prototypes for browsers the support element
    /// constructors. For other browsers you could step each element and
    /// apply the functions through-out the node tree... however this would
    /// be quite messy, far better just to always call from the root node -
    /// or use div.getElementsByTagName.call( localElement, 'tag' );
    div.getElementsByTagName = function(t){return gtn.call(this,nspace(t));}
    div.getElementsByName    = createGetElementByAttr('name', true);
    div.getElementById       = createGetElementByAttr('id', false);

    /// return the final element
    return div;
  }

  window.onload = function(){

    /// parse the HTML source into a node tree
    var dom = parseHTML( document.getElementById('source').innerHTML );

    /// test some look ups :)
    var a = dom.getElementsByTagName('head'),
        b = dom.getElementsByTagName('title'),
        c = dom.getElementsByTagName('script'),
        d = dom.getElementById('body');

    /// alert the result
    alert(a[0].innerHTML);
    alert(b[0].innerHTML);
    alert(c[0].innerHTML);
    alert(d.innerHTML);

  }
</script>
</head>
<body>
  <xmp id="source">
    <!DOCTYPE html>
    <html>
    <head>
      <!-- Comment //-->
      <meta charset="utf-8">
      <meta name="robots" content="index, follow">
      <title>An example</title>
      <link href="test.css" />
      <script>alert('of parsing..');</script>
    </head>
    <body id="body">
      <b>in a similar way to createDocumentFragment</b>
    </body>
    </html>
  </xmp>
</body>
</html>