TEST
JavaScript for HTML Browsers: HTML Outlines and Microdata
最終更新:
eriax
-
view
制限
- セクション構造からのアウトライン抽出は HTML: 4.4.11.1 に準拠。効率は考慮外。
- Microdata アイテム抽出は HTML: 5.2.5, 5.5.1 に準拠。ただし絶対 URI のチェックはしない。
- どちらも抽出するだけ。
使用例
アウトライン
var hapi = HTML_Outlines_and_Microdata_for_HTML_Browsers;
// 文字列として確認
alert(hapi.createOutlineString(document));
// ul 要素を生成
var toc = hapi.createOutlineList(document);
Microdata
var hapi = HTML_Outlines_and_Microdata_for_HTML_Browsers;
// 文書内のアイテムを JSON で抽出
var items = hapi.findAllItems(document);
var jsonData = JSON.stringify(items);
ソースコード
// Fallback for engines without a native Array.prototype.map.
// Calls callbackfn(value, index, array) for every index that is present in
// the array and stores each return value at the same index of a new array,
// so holes in the source remain holes in the result.
// An optional second argument is used as `this` inside callbackfn.
// Throws TypeError when callbackfn is not a function, like the native method.
if ('undefined' === typeof Array.prototype.map) {
    Array.prototype.map = function (callbackfn) {
        var thisArg = arguments[1];
        var count;
        var result;
        var i;
        if ('function' !== typeof callbackfn) {
            throw new TypeError('Array.prototype.map: callbackfn is not a function');
        }
        // Coerce to an unsigned integer so that array-likes with a missing or
        // odd length cannot turn `new Array(count)` into a one-element array
        // or a RangeError.
        count = this.length >>> 0;
        result = new Array(count);
        for (i = 0; i < count; i++) {
            if (i in this) {
                result[i] = callbackfn.call(thisArg, this[i], i, this);
            }
        }
        return result;
    };
}
// Fallback for engines without a native Array.prototype.indexOf.
// Returns the first index at or after fromIndex (optional second argument)
// whose element is strictly equal to searchElement, or -1 when none is.
// A negative fromIndex counts back from the end of the array.
if ('undefined' === typeof Array.prototype.indexOf) {
    Array.prototype.indexOf = function (ceil, floor) {
        return function (searchElement) {
            var fromIndex = arguments[1];
            var count = this.length >>> 0;
            // NaN and undefined both fall back to 0.
            var i = Number(fromIndex) || 0;
            // Truncate toward zero.
            i = (i < 0) ? ceil(i) : floor(i);
            if (i < 0) {
                i += count;
                // Clamp offsets that reach before the start; without this a
                // fromIndex of -Infinity would never reach `count`.
                if (i < 0) {
                    i = 0;
                }
            }
            for (; i < count; i++) {
                if (i in this) {
                    if (this[i] === searchElement) {
                        return i;
                    }
                }
            }
            return -1;
        };
    }(Math.ceil, Math.floor);
}
// Fallback for engines without a native Array.prototype.filter.
// Calls callbackfn(value, index, array) for every index that is present in
// the array and returns a new dense array of the values for which the
// callback returned a truthy result.
// An optional second argument is used as `this` inside callbackfn.
// Throws TypeError when callbackfn is not a function, like the native method.
if ('undefined' === typeof Array.prototype.filter) {
    Array.prototype.filter = function (callbackfn) {
        var thisArg = arguments[1];
        var result = [];
        var count;
        var value;
        var i;
        if ('function' !== typeof callbackfn) {
            throw new TypeError('Array.prototype.filter: callbackfn is not a function');
        }
        count = this.length >>> 0;
        for (i = 0; i < count; i++) {
            if (i in this) {
                // Read the element before the callback runs so that the kept
                // value is the one the callback was shown.
                value = this[i];
                if (callbackfn.call(thisArg, value, i, this)) {
                    result[result.length] = value;
                }
            }
        }
        return result;
    };
}
////////////////////////////////////////////////////////////////////////
var HTML_Outlines_and_Microdata_for_HTML_Browsers = new function () { /*@cc_on@*/
//////////////////////
// Outlines
//
// Builds a predicate telling whether a node is an element (nodeType 1) whose
// tag name matches the given pattern.
// A falsy node is handed back unchanged; a non-element yields false.
var makeElementPredicate = function (pattern) {
    return function (node) {
        if (!node) {
            return node;
        }
        if (node.nodeType !== 1) {
            return false;
        }
        return pattern.test(node.tagName);
    };
};
// section, nav, article, aside
var isSectioningContent = makeElementPredicate(/^(?:section|nav|article|aside)$/i);
// body, blockquote, details, fieldset, figure, td
var isSectioningRoot = makeElementPredicate(/^(?:body|blockquote|details|fieldset|figure|td)$/i);
// h1-h6 and hgroup
var isHeadingContent = makeElementPredicate(/^(?:h[1-6]|hgroup)$/i);
// hgroup only
var isHeadingGroupContent = makeElementPredicate(/^hgroup$/i);
// Heading rank of an element: an hgroup is ranked through
// getHeadingGroupLevel, everything else through getSingleHeadingLevel.
var getLevel = function (elt) {
    return isHeadingGroupContent(elt) ? getHeadingGroupLevel(elt) : getSingleHeadingLevel(elt);
};
// Numeric rank of an h1-h6 element, read from the last character of its tag
// name. Yields NaN when the tag name does not end in a digit.
var getSingleHeadingLevel = function (elt) {
    // Explicit radix: never leave the base to engine-specific guessing.
    return parseInt(elt.tagName.slice(-1), 10);
};
// Rank of an hgroup element: the rank of its highest-ranked heading child,
// i.e. the smallest h1-h6 number found among elt.childNodes.
// An hgroup without any ranked heading child is treated as h1 (rank 1), as
// the HTML specification prescribes, instead of leaking Infinity into the
// level comparisons of the outline algorithm.
var getHeadingGroupLevel = function (elt) {
    var nodes = elt.childNodes;
    var count = nodes.length;
    var best = Infinity;
    var level;
    var i;
    for (i = 0; i < count; i++) {
        if (isHeadingContent(nodes[i])) {
            level = getSingleHeadingLevel(nodes[i]);
            // A nested hgroup gives NaN here, which never compares smaller
            // and is therefore ignored.
            if (level < best) {
                best = level;
            }
        }
    }
    return (best === Infinity) ? 1 : best;
};
//
// Outline container for one sectioning element.
// Copies every own property of the optional initializer object onto the new
// instance; without an initializer the prototype defaults stay in effect.
function HTMLOutlinee(arg) {
    var key;
    if (arguments.length === 0 || !arg) {
        return;
    }
    for (key in arg) {
        if (arg.hasOwnProperty(key)) {
            this[key] = arg[key];
        }
    }
}
// Factory: a fresh outlinee for the given element with an empty section list
// and no parent.
HTMLOutlinee.create = function (element) {
    var init = {};
    init.element = element;
    init.sections = [];
    init.parent = null;
    return new HTMLOutlinee(init);
};
HTMLOutlinee.prototype = new function () {
    var proto = this;
    // Defaults shared by instances built without an initializer.
    proto.element = null;
    proto.sections = null;
    proto.parent = null;
    proto.type = '#outlinee';
    // Appends a section, marks this outlinee as its owner and returns it.
    proto.addSection = function (section) {
        var list = this.sections;
        list.push(section);
        section.outlinee = this;
        return section;
    };
    // First section, or undefined when the list is empty.
    proto.getFirstSection = function () {
        var list = this.sections;
        return list[0];
    };
    // Last section, or undefined when the list is empty.
    proto.getLastSection = function () {
        return this.sections[this.sections.length - 1];
    };
    proto.toString = function () {
        return '[object HTMLOutlinee]';
    };
};
// One section of an outline.
// Copies every own property of the optional initializer object onto the new
// instance; without an initializer the prototype defaults stay in effect.
function HTMLSection(arg) {
    var key;
    if (arguments.length === 0 || !arg) {
        return;
    }
    for (key in arg) {
        if (arg.hasOwnProperty(key)) {
            this[key] = arg[key];
        }
    }
}
// Factory: a section with the given heading, parent and owning outlinee and
// with empty child and associated-node lists.
HTMLSection.create = function (heading, parent, outlinee) {
    var init = {};
    init.heading = heading;
    init.parent = parent;
    init.children = [];
    init.outlinee = outlinee;
    init.associated = [];
    return new HTMLSection(init);
};
// Defaults shared by instances built without an initializer.
HTMLSection.prototype.heading = null;
HTMLSection.prototype.parent = null;
HTMLSection.prototype.children = null;
HTMLSection.prototype.outlinee = null;
HTMLSection.prototype.associated = null;
HTMLSection.prototype.type = '#section';
// Appends a child (section or nested outlinee), sets its parent to this
// section and returns it.
HTMLSection.prototype.appendChild = function (div) {
    var list = this.children;
    list.push(div);
    div.parent = this;
    return div;
};
// True when at least one child is an HTMLSection.
HTMLSection.prototype.hasChildSections = function () {
    var list = this.children;
    var total = list.length;
    var index = 0;
    while (index < total) {
        if (list[index] instanceof HTMLSection) {
            return true;
        }
        index += 1;
    }
    return false;
};
// Last child that is an HTMLSection, or null when there is none.
HTMLSection.prototype.getLastSection = function () {
    var list = this.children;
    var index = list.length;
    while (index > 0) {
        index -= 1;
        if (list[index] instanceof HTMLSection) {
            return list[index];
        }
    }
    return null;
};
// Records a DOM node as belonging to this section and returns the node.
HTMLSection.prototype.associate = function (node) {
    var list = this.associated;
    list.push(node);
    return node;
};
HTMLSection.prototype.toString = function () {
    return '[object HTMLSection]';
};
// Builds the outline for the subtree rooted at `root`.
// Walks the tree depth-first, calling the handler below once with 'enter'
// before a node's children and once with 'exit' after them, and maintains
// the current outlinee, the current section and a stack while doing so.
// Returns the HTMLOutlinee that is current when the walk ends, or null when
// no sectioning content / sectioning root element was entered.
function createOutlinee(root) {
var currentOutlinee = null;
var currentSection = null;
// Holds suspended outlinees (pushed when a nested sectioning element is
// entered) and heading elements (pushed while inside a heading).
var stack = [];
// Iterative depth-first walker: no recursion, uses firstChild / nextSibling /
// parentNode only and never climbs above `root`.
(function (root, callbackfn) {
var node = root;
var n;
while (node) {
callbackfn(node, 'enter');
if ((n = node.firstChild)) {
node = n;
continue;
}
// No children: exit this node, then keep exiting ancestors until one of
// them has a next sibling or the root itself has been exited.
do {
callbackfn(node, 'exit');
if (node == root) {
node = null;
break;
}
if ((n = node.nextSibling)) {
node = n;
break;
}
}
while ((node = node.parentNode))
}
})(root, function (node, mode) {
var top = stack[stack.length - 1];
// Leaving the heading element that is on top of the stack.
if (top === node && mode === 'exit') {
stack.pop();
}
// Inside a heading: its descendants take no part in the outline.
else if (isHeadingContent(top)) {
;
}
// Entering a sectioning content or sectioning root element: suspend the
// current outlinee (if any) and start a new one with one heading-less
// section whose parent is the previously current section.
else if (mode === 'enter' && (isSectioningContent(node) || isSectioningRoot(node))) {
// A section left without a heading is marked with `undefined` rather than
// `null`; toJSON later labels the two cases differently.
if (currentOutlinee !== null && currentSection.heading === null) {
currentSection.heading = undefined;
}
if (currentOutlinee !== null) {
stack.push(currentOutlinee);
}
currentOutlinee = HTMLOutlinee.create(node);
currentSection = HTMLSection.create(null, currentSection, currentOutlinee);
currentOutlinee.addSection(currentSection);
}
// Leaving nested sectioning content: resume the suspended outlinee and
// attach the finished one to its last section.
else if (mode === 'exit' && (isSectioningContent(node) && stack.length > 0)) {
var outlinee = currentOutlinee;
currentOutlinee = stack.pop();
currentSection = currentOutlinee.getLastSection();
currentSection.appendChild(outlinee);
}
// Leaving a nested sectioning root: resume the suspended outlinee and attach
// the finished one to the deepest last subsection of its last section.
else if (mode === 'exit' && (isSectioningRoot(node) && stack.length > 0)) {
var outlinee = currentOutlinee;
currentOutlinee = stack.pop();
currentSection = currentOutlinee.getLastSection();
while (currentSection.hasChildSections()) {
currentSection = currentSection.getLastSection();
}
currentSection.appendChild(outlinee);
}
// Leaving a sectioning element while the stack is empty (the outermost one):
// the current section becomes the outlinee's first section.
else if (mode === 'exit' && (isSectioningContent(node) || isSectioningRoot(node))) {
currentSection = currentOutlinee.getFirstSection();
}
// Nothing to do until the first outlinee exists.
else if (currentOutlinee === null) {}
else if (mode === 'enter' && isHeadingContent(node)) {
// `== null` matches both null (no heading yet) and the `undefined` marker
// set above, so the heading is adopted by the current section.
if (currentSection.heading == null) {
currentSection.heading = node;
}
// Rank not lower than the outlinee's last section heading: start a new
// section directly in the outlinee.
else if (getLevel(node) <= getLevel(currentOutlinee.getLastSection().heading)) {
currentSection = currentOutlinee.addSection(HTMLSection.create(node, currentSection.parent, currentOutlinee));
}
// Otherwise climb from the current section through `parent` until a section
// with a higher-ranked heading is found and nest the new section there.
// NOTE(review): the loop has no guard for candidateSection running out or
// for a candidate without a heading element - confirm this cannot happen.
else {
var candidateSection = currentSection;
while (true) {
if (getLevel(node) > getLevel(candidateSection.heading)) {
currentSection = candidateSection.appendChild(HTMLSection.create(node, candidateSection.parent, currentOutlinee));
break;
}
var newCandidateSection = candidateSection.parent;
candidateSection = newCandidateSection;
}
}
// Remember the heading so that its descendants are skipped until it exits.
stack.push(node);
}
// The `if` below is a separate statement, not part of the chain above: on
// every 'exit' the node is associated with the then-current section.
else {} if (mode === 'exit' && currentSection !== null) {
currentSection.associate(node);
}
});
if (currentOutlinee === null) {
return null;
}
return currentOutlinee;
}
// Text used to label a heading.
// - element: `alt` for img, `value` for input, otherwise the concatenated
//   text of all child nodes (recursively)
// - text / CDATA node: its data
// - anything else: the empty string
var getTextContent = function (n) {
    switch (n.nodeType) {
    case 1:
        if (/^img$/i.test(n.nodeName)) {
            return n.alt;
        }
        if (/^input$/i.test(n.nodeName)) {
            return n.value;
        }
        // Recurse by name instead of arguments.callee, which throws in
        // strict-mode code. Array.prototype.map is borrowed because
        // childNodes is a node list, not an array.
        return Array.prototype.map.call(n.childNodes, function (child) {
            return getTextContent(child);
        }).join('');
    case 3:
    case 4:
        return n.data;
    default:
        return '';
    }
};
// JSON-oriented helpers added to every HTMLOutlinee.
(function (proto) {
    // Whether this outlinee's element is a sectioning root.
    proto.isSectioningRoot = function () {
        return isSectioningRoot(this.element);
    };
    // No sections: false. Exactly one section: whatever that section reports
    // from isEffectiveSection(). More than one: true.
    proto.hasEffectiveSections = function () {
        var list = this.sections;
        if (list.length === 0) {
            return false;
        }
        if (list.length === 1) {
            return list[0].isEffectiveSection();
        }
        return true;
    };
    // Serializes the outlinee. The JSON of all sections is concatenated into
    // one flat list. When called with a truthy `internal` on an outlinee that
    // is not a sectioning root, that bare list is returned so the caller can
    // splice it into its own children; otherwise it is wrapped in a '#root'
    // object carrying the element's tag name.
    proto.toJSON = function (internal) {
        var list = this.sections;
        var total = list.length;
        var flattened = [];
        var index = 0;
        while (index < total) {
            flattened = flattened.concat(list[index].toJSON(true));
            index += 1;
        }
        if (internal && !this.isSectioningRoot()) {
            return flattened;
        }
        return {
            type: '#root',
            context: this.element.tagName,
            children: flattened
        };
    };
})(HTMLOutlinee.prototype);
// JSON-oriented helpers added to every HTMLSection.
(function (proto) {
    // A section counts as effective when its outlinee's element is sectioning
    // content, when it has child sections, or when its heading is anything
    // other than null.
    proto.isEffectiveSection = function () {
        var ownerElement = this.outlinee.element;
        return isSectioningContent(ownerElement) || this.hasChildSections() || this.heading !== null;
    };
    // Serializes the section as
    //   { context, type: '#section', id, label, children }
    // `context` is the tag name of the owning outlinee's element. With a
    // heading element, `id` is its id and `label` its text; without one `id`
    // stays undefined and `label` is a placeholder that distinguishes a null
    // heading from an undefined one. The JSON of all children is
    // concatenated into one flat list.
    proto.toJSON = function () {
        var heading = this.heading;
        var tagName = this.outlinee.element.tagName;
        var list = this.children;
        var total = list.length;
        var flattened = [];
        var index = 0;
        var id;
        var label;
        if (heading) {
            id = heading.id;
            label = getTextContent(heading);
        }
        else if (heading === null) {
            label = '(anonymous\x20' + tagName + ')';
        }
        else {
            label = '(undefined\x20' + tagName + ')';
        }
        while (index < total) {
            flattened = flattened.concat(list[index].toJSON(true));
            index += 1;
        }
        return {
            context: tagName,
            type: '#section',
            id: id,
            label: label,
            children: flattened
        };
    };
})(HTMLSection.prototype);
//
// Renders outline JSON (as produced by toJSON) as nested HTML lists.
//   jsonData - object with a `children` array; only children whose type is
//              '#section' are rendered
//   params   - { ownerDocument: document used to create nodes,
//                tagName: list element name, 'ul' when omitted }
// Every section becomes <li><a>label</a>...</li>; the anchor gets
// href="#<id>" when the section has an id. Subsections are rendered as a
// nested list inside their own <li>, so the result is valid list markup.
// Returns the list element, or an empty DocumentFragment when jsonData has no
// children at all.
var outlineToList = function (jsonData, params) {
    var doc = params.ownerDocument;
    // Templates are built once and cloned for every list / item.
    var listTemplate = doc.createElement(params.tagName || 'ul');
    var itemTemplate = doc.createElement('li');
    itemTemplate.appendChild(doc.createElement('a')).appendChild(doc.createTextNode('_'));
    // Named helper instead of arguments.callee, which throws in strict-mode
    // code.
    var build = function (data) {
        var children = data.children;
        var childCount = children.length;
        var frag = doc.createDocumentFragment();
        var list;
        var item;
        var anchor;
        var child;
        var i;
        if (childCount === 0) {
            return frag;
        }
        for (i = 0; i < childCount; i++) {
            child = children[i];
            if (child.type !== '#section') {
                continue;
            }
            item = itemTemplate.cloneNode(true);
            anchor = item.firstChild;
            if (child.id) {
                anchor.href = '#' + encodeURIComponent(child.id);
            }
            // Replace the placeholder text of the cloned anchor.
            anchor.firstChild.data = child.label;
            // Appending an empty fragment adds nothing, so leaf sections end
            // up with the anchor only.
            item.appendChild(build(child));
            frag.appendChild(item);
        }
        list = listTemplate.cloneNode(false);
        list.appendChild(frag);
        return list;
    };
    return build(jsonData);
};
// Public API: builds the outline of `root` and renders it as nested <ul>
// lists.
//   root - document or element whose outline is wanted
//   doc  - document used to create the list nodes; defaults to `root` itself
//          when it is a document node (nodeType 9), else root.ownerDocument
// Returns whatever outlineToList produces for the outline.
this.createOutlineList = function (root, doc) {
    var outlinee = createOutlinee(root);
    var data;
    if (!doc) {
        doc = (root.nodeType === 9) ? root : root.ownerDocument;
    }
    // createOutlinee yields null when the subtree contains no sectioning
    // element; render that as an outline without sections instead of failing
    // on null.toJSON().
    data = outlinee ? outlinee.toJSON() : { type: '#root', children: [] };
    return outlineToList(data, {
        ownerDocument: doc,
        tagName: 'ul'
    });
};
// Renders outline JSON (as produced by toJSON) as indented plain text, one
// line per entry:
//   '= [TAG]'        for a '#root' object
//   '+ [TAG] label'  for a '#section' object
// `level` controls the indentation (two spaces per step above 1) and grows by
// two for every nesting step; it defaults to 1.
// Labels are trimmed, line breaks become ' -- ' and remaining whitespace runs
// collapse to a single space.
// Returns the lines joined with '\n' ('' when there is nothing to print).
var outlineToString = function (jsonData, level) {
    var tab = 2;
    var result = [];
    var padding;
    var children;
    var childCount;
    var child;
    var nested;
    var i;
    if (!level) {
        level = 1;
    }
    if (jsonData.type === '#root') {
        padding = new Array(level).join('\x20\x20') + '= ';
        result.push(padding + '[' + jsonData.context + ']');
        level += tab;
    }
    children = jsonData.children;
    childCount = children.length;
    if (childCount > 0) {
        padding = new Array(level).join('\x20\x20') + '+ ';
        for (i = 0; i < childCount; i++) {
            child = children[i];
            if (child.type === '#section') {
                result.push(padding + '[' + child.context + '] ' + child.label.replace(/^\s+|\s+$/g, '').replace(/(?:\r\n|\r|\n)+/g, ' -- ').replace(/\s{2,}/g, '\x20'));
            }
            // Recurse by name instead of arguments.callee, which throws in
            // strict-mode code.
            nested = outlineToString(child, level + tab);
            if (nested) {
                result.push(nested);
            }
        }
    }
    return result.join('\n');
};
// Public API: builds the outline of `root` and returns it as indented plain
// text (see outlineToString for the line format).
// Returns the empty string when the subtree contains no sectioning element,
// because createOutlinee yields null in that case.
this.createOutlineString = function (root) {
    var outlinee = createOutlinee(root);
    if (!outlinee) {
        return '';
    }
    return outlineToString(outlinee.toJSON());
};
//////////////////////
// Microdata
//
// Collects the property elements of the microdata item `root`.
// Starting from root's child elements and from the elements named by its
// itemref attribute, elements are visited breadth-first; the children of an
// element that has itemscope are not visited, and no element is visited
// twice. Every visited element that has an itemprop attribute is collected.
// Returns the collected elements sorted in document order. `root` itself is
// never part of the result.
// The /*@ ... @*/ comments are JScript conditional compilation: they swap in
// getAttributeNode-based attribute tests and sourceIndex-based sorting there.
var findItemProperties = function (root) {
    var push = Array.prototype.push;
    var filter = Array.prototype.filter;
    var isElement = function (n) {
        return n.nodeType === 1;
    };
    var results = [];
    var memory = [];
    var pending = [];
    // The item is only remembered as visited. Listing it among the results
    // would make a nested item (itemscope plus itemprop) show up as a
    // property of itself.
    memory.push(root);
    push.apply(pending, filter.call(root.childNodes, isElement));
    /*@if(1)var a;if((a=root.getAttributeNode('itemref'))&&a.specified){@else@*/
    if (root.hasAttribute('itemref')) { /*@end@*/
        var refs;
        var TrailWS = /^\s+|\s+$/g;
        var WS = /\s+/;
        if ((refs = root.getAttribute('itemref'/*@,0@*/)) && (refs = refs.replace(TrailWS, ''))) {
            refs = refs.split(WS);
            var refCount = refs.length;
            var doc = root.ownerDocument;
            var m;
            var i;
            // IDs that do not resolve to an element are skipped.
            for (i = 0; i < refCount; i++) {
                if ((m = doc.getElementById(refs[i]))) {
                    pending.push(m);
                }
            }
        }
    }
    while (pending.length > 0) {
        var current = pending.shift();
        if (memory.indexOf(current) >= 0) {
            continue;
        }
        memory.push(current);
        // Do not descend into a nested item; its contents belong to it.
        /*@if(1)var a;if(!(a=current.getAttributeNode('itemscope'))||!a.specified){@else@*/
        if (!current.hasAttribute('itemscope')) { /*@end@*/
            push.apply(pending, filter.call(current.childNodes, isElement));
        }
        /*@if(1)var a;if((a=current.getAttributeNode('itemprop'))&&a.specified){@else@*/
        if (current.hasAttribute('itemprop')) { /*@end@*/
            results.push(current);
        }
    }
    if ('undefined' !== typeof root.compareDocumentPosition) {
        results.sort(function (e1, e2) {
            // Identity comparison; Node.isSameNode is not available in every
            // browser that has compareDocumentPosition.
            if (e1 === e2) {
                return 0;
            }
            if (0 !== (e1.compareDocumentPosition(e2) & Node.DOCUMENT_POSITION_FOLLOWING)) {
                return -1;
            }
            return 1;
        });
    }
    else {
        /*@if(1)results.sort(function(e1,e2){return e1.sourceIndex-e2.sourceIndex});@else@*/
        var r1 = root.ownerDocument.createRange();
        var r2 = root.ownerDocument.createRange();
        results.sort(function (e1, e2) {
            r1.selectNode(e1);
            r2.selectNode(e2);
            return r1.compareBoundaryPoints(Range.START_TO_START, r2);
        }); /*@end@*/
    }
    return results;
};
// Extracts the microdata items below `root` (a document or element).
// Every descendant element that has an itemscope attribute is converted with
// getObject_JSON, each with a fresh `memory` list.
// Returns { items: [...] } in the order of getElementsByTagName('*').
// NOTE(review): elements that also carry itemprop (nested items) are emitted
// here as well as inside their parent item - confirm whether only top-level
// items are wanted.
var findAllItems_JSON = function (root) {
var result = {};
var items = [];
var elems = root.getElementsByTagName('*');
var elemCount = elems.length;
var elem;
var i;
for (i = 0; i < elemCount; i++) {
elem = elems[i];
// Under JScript conditional compilation the attribute test is done with
// getAttributeNode(...).specified instead of hasAttribute.
/*@if(1)var a;if(elem.nodeType===1&&(a=elem.getAttributeNode('itemscope'))&&a.specified){@else@*/
if (elem.hasAttribute('itemscope')) { /*@end@*/
items.push(getObject_JSON(elem, []));
}
}
result['items'] = items;
return result;
};
// Converts one microdata item element into a plain object:
//   { type: <itemtype>, id: <itemid>, properties: { name: [values...] } }
// `type` and `id` are present only when the attribute exists.
//   item   - element with itemscope
//   memory - list of items already being converted on this path; `item` is
//            added to it, and a nested item found in it again is reported as
//            the string 'ERROR' instead of being converted a second time
// The /*@ ... @*/ comments are JScript conditional compilation: there the
// attribute tests use getAttributeNode(...).specified and getAttribute gets
// an extra 0 argument.
var getObject_JSON = function (item, memory) {
var result = {};
memory.push(item);
/*@if(1)var a;if((a=item.getAttributeNode('itemtype'))&&a.specified){@else@*/
if (item.hasAttribute('itemtype')) { /*@end@*/
result['type'] = item.getAttribute('itemtype' /*@,0@*/ );
}
/*@if(1)var a;if((a=item.getAttributeNode('itemid'))&&a.specified){@else@*/
if (item.hasAttribute('itemid')) { /*@end@*/
result['id'] = item.getAttribute('itemid' /*@,0@*/ );
}
var properties = {};
var elems = findItemProperties(item);
var elemCount = elems.length;
var elem;
var i;
var TrailWS = /^\s+|\s+$/g;
var WS = /\s+/;
for (i = 0; i < elemCount; i++) {
elem = elems[i];
// Skip anything without an itemprop attribute.
/*@if(1)var a;if(!(a=elem.getAttributeNode('itemprop'))||!a.specified){@else@*/
if (!elem.hasAttribute('itemprop')) { /*@end@*/
continue;
}
// itemprop is a whitespace-separated list of names; an empty list is ignored.
var names = elem.getAttribute('itemprop' /*@,0@*/ ).replace(TrailWS, '');
if (!names) {
continue;
}
names = names.split(WS);
var value;
// A property element that is itself an item is converted recursively.
/*@if(1)var a;if((a=elem.getAttributeNode('itemscope'))&&a.specified){@else@*/
if (elem.hasAttribute('itemscope')) { /*@end@*/
if (memory.indexOf(elem) < 0) {
value = getObject_JSON(elem, memory);
}
else {
value = 'ERROR';
}
}
// Otherwise the value is read from an element-specific property.
else {
switch (elem.tagName.toUpperCase()) {
case 'META':
value = elem.content || '';
break;
case 'EMBED':
case 'IFRAME':
case 'IMG':
value = elem.src || '';
break;
case 'AUDIO':
case 'SOURCE':
case 'TRACK':
case 'VIDEO':
value = elem.src;
// Fall back to the raw attribute when the element has no `src` property.
if ('undefined' === typeof value) {
value = elem.getAttribute('src' /*@,0@*/ ) || '';
}
break;
case 'A':
case 'AREA':
case 'LINK':
value = elem.href;
break;
case 'OBJECT':
value = elem.data;
break;
case 'TIME':
value = elem.dateTime;
// Fall back to the raw attribute when the element has no `dateTime` property.
if ('undefined' === typeof value) {
value = elem.getAttribute('datetime' /*@,0@*/ ) || '';
}
break;
default:
// Text of the element: innerText under conditional compilation,
// textContent elsewhere.
/*@if(1)var s;if('undefined'!==typeof(s=elem.innerText)){value=s}@else@*/
value = elem.textContent; /*@end@*/
break;
}
}
// The same value is recorded under every name listed in itemprop; values of
// a repeated name accumulate in one array.
var nameCount = names.length;
var name;
var j;
for (j = 0; j < nameCount; j++) {
name = names[j];
if (properties.hasOwnProperty(name)) {
properties[name].push(value);
}
else {
properties[name] = [value];
}
}
}
result['properties'] = properties;
return result;
};
// Public API: findAllItems(root) returns { items: [...] } for the microdata
// items found below `root`.
this.findAllItems = findAllItems_JSON;
};
- 初出 2011-09-04/05
- 修正 2012-03-05