Appearance
Semantic DOM Snapshot
Extract a minimal semantic representation of a page's DOM, stripping non-semantic noise while preserving meaningful content. Walks the live DOM depth-first, copies to clipboard, and downloads automatically.
When to use
Use semantic snapshots when you need a compact, machine-readable representation of a page's meaningful content. Common cases include feeding page structure to an AI agent, building accessibility audits, extracting content for search indexing, and debugging semantic markup without visual noise.
The pattern
Copy the entire block below — function definition and invocation — and paste it into the browser console. It walks the DOM, strips non-semantic elements and attributes, unwraps presentational wrappers, traverses Shadow DOM and same-origin iframes, captures live form values, copies the result to the clipboard, and downloads it as an HTML file.
javascript
/**
* Walk the live DOM, emit a minimal semantic snapshot as clean HTML.
* Copies to clipboard and downloads automatically.
*
* Usage (browser console):
* dom2semanticHTML()
*/
async function dom2semanticHTML() {
// Shared message sink: every step pushes human-readable entries into
// `ok` or `fail`; printLog() reports them at the end.
const log = { ok: [], fail: [] };
// Build the stripped-down copy of <body> as a DocumentFragment.
const fragment = buildSnapshot(document.body, log);
// Prepend a comment recording timestamp, URL and viewport size.
addMetadata(fragment, log);
const html = serialize(fragment, log);
// copyToClipboard is synchronous, so it is not awaited.
copyToClipboard(html, log);
// download() waits before cleaning up its temporary anchor, hence the await.
await download(html, snapshotFilename(), log);
printLog(log);
// The helpers used above are function declarations that follow this return;
// they are hoisted, so calling them earlier in the body is fine.
return html;
/**
* Walk the live DOM depth-first. Each node is routed to the first matching
* layer.
*/
function buildSnapshot(body, log) {
const root = document.createDocumentFragment();
const walker = document.createTreeWalker(
body,
NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
);
const nodeMap = new Map([[body, root]]);
const stats = {
removed: 0,
unwrapped: 0,
shadowRoots: 0,
iframes: 0,
formControls: 0,
};
const router = createRouter();
router.use(Node.TEXT_NODE, copyTextNode);
router.use("script, style, noscript, link, meta", stripNonSemantic);
router.use(isHidden, stripHidden);
router.use("img", semanticImage);
router.use("input, textarea, select", semanticFormControl);
router.use("option", semanticOption);
router.use("a", semanticLink);
router.use(
"table, thead, tbody, tfoot, tr, th, td, caption",
semanticTable,
);
router.use(semanticClone);
let node;
while ((node = walker.nextNode())) {
const parent = nodeMap.get(node.parentNode);
if (!parent) continue;
const result = router.run(node);
if (!result) continue;
if (result.nodeType === Node.DOCUMENT_FRAGMENT_NODE) {
// Unwrapped wrapper — map children to parent so their
// descendants attach correctly
const children = [...result.childNodes];
for (const child of children) {
parent.appendChild(child);
}
nodeMap.set(node, parent);
} else {
parent.appendChild(result);
nodeMap.set(node, result);
}
// Traverse shadow DOM (including slotted content).
// Use nodeMap target — if the host was unwrapped,
// result is an empty fragment; nodeMap points to
// the correct parent.
const target = nodeMap.get(node);
if (node.shadowRoot) {
traverseShadowRoot(node, target);
}
// Traverse same-origin iframes
if (node.tagName === "IFRAME" && node.contentDocument) {
traverseIframe(node, target);
}
}
logStats(stats, log);
return root;
/**
 * Copy a text node, trimmed. Whitespace-only text is dropped.
 *
 * Text that is a direct child of a <textarea> is also dropped: it is the
 * control's default value, and semanticFormControl already writes the live
 * value into the cloned textarea, so copying it would duplicate the content.
 *
 * @param {Text} node - Source text node.
 * @returns {Text|false} New text node, or false when nothing should be kept.
 */
function copyTextNode(node) {
  if (node.parentNode?.localName === "textarea") return false;
  const text = node.textContent.trim();
  if (!text) return false;
  return document.createTextNode(text);
}
/**
 * Handler for script/style/noscript/link/meta: count the element as removed
 * and tell the walker to drop it.
 *
 * @returns {false} Always false, which the walker treats as "skip".
 */
function stripNonSemantic() {
  stats.removed += 1;
  return false;
}
/**
 * Handler for elements matched by isHidden: count the element as removed and
 * tell the walker to drop it.
 *
 * @returns {false} Always false, which the walker treats as "skip".
 */
function stripHidden() {
  stats.removed += 1;
  return false;
}
/**
 * Reduce an <img> to its accessible description.
 *
 * Images without an alt attribute are dropped and counted as removed.
 * Otherwise a fresh <img> is produced carrying alt, aria-* and role. An
 * empty alt is preserved as alt="" so the explicit "decorative" marker
 * survives instead of degrading to an <img> with no alt at all.
 *
 * @param {HTMLImageElement} node - Source image.
 * @returns {HTMLImageElement|false} Clean image, or false when dropped.
 */
function semanticImage(node) {
  const alt = node.getAttribute("alt");
  if (alt === null) {
    stats.removed++;
    return false;
  }
  const img = document.createElement("img");
  img.setAttribute("alt", alt);
  copyAriaAttributes(node, img);
  const role = node.getAttribute("role");
  if (role) img.setAttribute("role", role);
  return img;
}
/**
 * Clone an <input>, <textarea> or <select> keeping only functional
 * attributes, aria-*, role, and the control's live state.
 *
 * - Checkboxes and radios record `checked`.
 * - Other inputs record their current value, except `file` and `password`
 *   inputs: the snapshot is copied to the clipboard and downloaded, so
 *   secrets typed into password fields must not end up in it.
 * - Textareas record their current value as text content.
 * - Visible <label> text is stored as aria-label, but only when the control
 *   does not already carry an author-supplied aria-label.
 *
 * @param {HTMLInputElement|HTMLTextAreaElement|HTMLSelectElement} node
 * @returns {HTMLElement} Clean clone of the control.
 */
function semanticFormControl(node) {
  const tag = node.tagName.toLowerCase();
  const el = document.createElement(tag);
  stats.formControls++;
  for (const attr of ["type", "name", "placeholder",
    "required", "disabled", "readonly", "multiple",
    "min", "max", "step", "pattern"]) {
    if (node.hasAttribute(attr)) {
      el.setAttribute(attr, node.getAttribute(attr));
    }
  }
  copyAriaAttributes(node, el);
  const role = node.getAttribute("role");
  if (role) el.setAttribute("role", role);
  if (tag === "input") {
    const type = node.type;
    if (type === "checkbox" || type === "radio") {
      if (node.checked) el.setAttribute("checked", "");
    } else if (type !== "file" && type !== "password" && node.value) {
      el.setAttribute("value", node.value);
    }
  } else if (tag === "textarea") {
    el.textContent = node.value;
  }
  // Associate label text without clobbering an explicit aria-label.
  if (!el.hasAttribute("aria-label")) {
    const label = findLabel(node);
    if (label) {
      el.setAttribute("aria-label", label);
    }
  }
  return el;
}
/**
 * Clone an <option>, keeping its value and selected state.
 *
 * The option's text is intentionally NOT copied here: the option's child
 * text node is visited separately by the traversal and appended to this
 * clone, so setting textContent as well would emit the label twice.
 *
 * @param {HTMLOptionElement} node - Source option.
 * @returns {HTMLOptionElement} Clean clone without text content.
 */
function semanticOption(node) {
  const option = document.createElement("option");
  if (node.value) {
    option.setAttribute("value", node.value);
  }
  if (node.selected) {
    option.setAttribute("selected", "");
  }
  return option;
}
/**
 * Clone an <a>, keeping only href, rel and target (when non-empty), then
 * aria-* attributes, then a non-empty role.
 *
 * @param {HTMLAnchorElement} node - Source link.
 * @returns {HTMLAnchorElement} Clean clone of the link.
 */
function semanticLink(node) {
  const link = document.createElement("a");
  const copyIfPresent = (name) => {
    const value = node.getAttribute(name);
    if (value) link.setAttribute(name, value);
  };
  ["href", "rel", "target"].forEach(copyIfPresent);
  copyAriaAttributes(node, link);
  copyIfPresent("role");
  return link;
}
/**
 * Clone a table-related element (table, thead, tbody, tfoot, tr, th, td,
 * caption), keeping only the structural attributes colspan/rowspan/scope/
 * headers (copied whenever present, even if empty), aria-* and a non-empty
 * role.
 *
 * @param {HTMLElement} node - Source table element.
 * @returns {HTMLElement} Clean clone with the same tag name.
 */
function semanticTable(node) {
  const clone = document.createElement(node.tagName.toLowerCase());
  ["colspan", "rowspan", "scope", "headers"]
    .filter((name) => node.hasAttribute(name))
    .forEach((name) => {
      clone.setAttribute(name, node.getAttribute(name));
    });
  copyAriaAttributes(node, clone);
  const roleValue = node.getAttribute("role");
  if (roleValue) {
    clone.setAttribute("role", roleValue);
  }
  return clone;
}
/**
 * Catch-all handler. A <div> or <span> with no semantic attributes is
 * "unwrapped": an empty DocumentFragment is returned, which the traversal
 * interprets as "attach my children to my parent instead". Any other element
 * is re-created with only its semantic attributes.
 *
 * @param {Element} node - Source element.
 * @returns {Element|DocumentFragment} Clean clone, or an empty fragment for
 *   an unwrapped presentational wrapper.
 */
function semanticClone(node) {
  const tagName = node.tagName.toLowerCase();
  const isGenericWrapper = tagName === "div" || tagName === "span";
  if (isGenericWrapper && !hasSemantic(node)) {
    stats.unwrapped += 1;
    return document.createDocumentFragment();
  }
  const clone = document.createElement(tagName);
  copySemanticAttributes(node, clone);
  return clone;
}
/**
 * Walk a host's open shadow tree and attach its semantic content to the
 * host's snapshot node.
 *
 * <slot> elements are replaced by their assigned (flattened) nodes, processed
 * through processSubtree. Elements inside the shadow tree that are themselves
 * shadow hosts are traversed recursively, and same-origin iframes inside the
 * shadow tree are traversed as well.
 *
 * @param {Element} hostNode - Element owning the shadow root.
 * @param {Node} hostClone - Snapshot node that receives the shadow content.
 */
function traverseShadowRoot(hostNode, hostClone) {
  stats.shadowRoots++;
  const shadowWalker = document.createTreeWalker(
    hostNode.shadowRoot,
    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
  );
  // Source node -> snapshot node that receives its children. Nodes left
  // out of the map have their subtree skipped by the `!sParent` guard.
  const shadowMap = new Map(
    [[hostNode.shadowRoot, hostClone]],
  );
  let sNode;
  while ((sNode = shadowWalker.nextNode())) {
    const sParent = shadowMap.get(sNode.parentNode);
    if (!sParent) continue;
    // Handle slotted content: when we encounter a
    // <slot>, process its assigned nodes from the
    // light DOM instead
    if (
      sNode.nodeType === Node.ELEMENT_NODE &&
      sNode.tagName === "SLOT"
    ) {
      const assigned = sNode.assignedNodes({
        flatten: true,
      });
      for (const aNode of assigned) {
        const aResult = processSubtree(aNode);
        if (aResult) sParent.appendChild(aResult);
      }
      continue;
    }
    const sResult = router.run(sNode);
    if (!sResult) continue;
    const isUnwrapped =
      sResult.nodeType === Node.DOCUMENT_FRAGMENT_NODE;
    if (isUnwrapped) {
      for (const child of [...sResult.childNodes]) {
        sParent.appendChild(child);
      }
    } else {
      sParent.appendChild(sResult);
    }
    // Where this node's own content belongs in the snapshot.
    const sTarget = isUnwrapped ? sParent : sResult;
    if (sNode.shadowRoot) {
      // Nested shadow host: descend into its shadow tree. The host is
      // deliberately left out of shadowMap so the walker skips its light
      // children; the nested traversal emits the slotted ones.
      traverseShadowRoot(sNode, sTarget);
      continue;
    }
    shadowMap.set(sNode, sTarget);
    if (sNode.tagName === "IFRAME" && sNode.contentDocument) {
      traverseIframe(sNode, sTarget);
    }
  }
}
/**
 * Snapshot the body of an iframe's document and attach the result to the
 * iframe's snapshot node. Any exception raised while doing so is recorded in
 * the failure log as a cross-origin iframe.
 *
 * @param {HTMLIFrameElement} iframeNode - Source iframe.
 * @param {Node} iframeClone - Snapshot node that receives the iframe content.
 */
function traverseIframe(iframeNode, iframeClone) {
  try {
    const { body: frameBody } = iframeNode.contentDocument;
    if (frameBody) {
      stats.iframes += 1;
      iframeClone.appendChild(buildSnapshot(frameBody, log));
    }
  } catch {
    const message = `iframe (cross-origin): ${iframeNode.src}`;
    log.fail.push(message);
  }
}
/**
 * Recursively process a subtree (used for slotted
 * content that lives outside the TreeWalker scope).
 *
 * Text nodes are trimmed and dropped when empty; elements are routed through
 * the same router as the main walk. Shadow hosts and same-origin iframes
 * found inside the subtree are traversed like in the main walk, and the
 * children of a <textarea> are skipped because its clone already carries the
 * live value as text.
 *
 * @param {Node} node - Root of the subtree to process.
 * @returns {Node|null} Snapshot node (possibly a DocumentFragment for an
 *   unwrapped wrapper), or null when nothing should be emitted.
 */
function processSubtree(node) {
  if (node.nodeType === Node.TEXT_NODE) {
    const text = node.textContent.trim();
    return text
      ? document.createTextNode(text)
      : null;
  }
  if (node.nodeType !== Node.ELEMENT_NODE) {
    return null;
  }
  const result = router.run(node);
  if (!result) return null;
  if (node.shadowRoot) {
    // The shadow tree pulls in this host's slotted light children itself,
    // so the light children must not be walked again here.
    traverseShadowRoot(node, result);
    return result;
  }
  if (node.tagName === "IFRAME" && node.contentDocument) {
    traverseIframe(node, result);
    return result;
  }
  if (node.tagName === "TEXTAREA") {
    return result;
  }
  // `result` is either an element clone or an empty fragment standing in for
  // an unwrapped wrapper; both can receive the processed children directly.
  for (const child of node.childNodes) {
    const childResult = processSubtree(child);
    if (childResult) {
      result.appendChild(childResult);
    }
  }
  return result;
}
/**
 * Find the visible label text for a form control.
 *
 * A wrapping <label> wins; otherwise a `label[for=id]` is looked up in the
 * control's own root (its document or shadow root) rather than the top-level
 * document, so controls inside same-origin iframes and shadow trees resolve
 * their labels correctly.
 *
 * @param {Element} input - Form control to find a label for.
 * @returns {string|null} Trimmed label text, or null when none is found.
 */
function findLabel(input) {
  // Check for wrapping <label>
  const wrapper = input.closest?.("label");
  if (wrapper) {
    const text = wrapper.textContent.trim();
    if (text) return text;
  }
  // Check for [for] label
  const id = input.id;
  if (id) {
    const scope = input.getRootNode?.() ?? document;
    const label = scope.querySelector(
      `label[for="${CSS.escape(id)}"]`,
    );
    if (label) return label.textContent.trim() || null;
  }
  return null;
}
/**
 * Router test: decide whether a node is hidden.
 *
 * An element counts as hidden when its `hidden` property is truthy, when
 * aria-hidden is "true", or when it is connected and its computed style has
 * display:none, visibility:hidden or opacity 0. Non-element nodes and
 * disconnected elements without those markers are never reported as hidden.
 *
 * @param {Node} node - Node under test.
 * @returns {boolean} True when the node should be stripped as hidden.
 */
function isHidden(node) {
  const isElement = node.nodeType === Node.ELEMENT_NODE;
  if (!isElement) {
    return false;
  }
  if (node.hidden || node.getAttribute("aria-hidden") === "true") {
    return true;
  }
  if (!node.isConnected) {
    return false;
  }
  const { display, visibility, opacity } = getComputedStyle(node);
  return display === "none" || visibility === "hidden" || opacity === "0";
}
/**
 * Tell whether an element carries any attribute that gives it meaning:
 * a non-empty role, lang or title, or any aria-* attribute.
 *
 * @param {Element} element - Element to inspect.
 * @returns {boolean} True when at least one such attribute is present.
 */
function hasSemantic(element) {
  const hasNamedAttribute = ["role", "lang", "title"].some(
    (name) => Boolean(element.getAttribute(name)),
  );
  if (hasNamedAttribute) {
    return true;
  }
  return [...element.attributes].some(
    ({ name }) => name.startsWith("aria-"),
  );
}
/**
 * Copy every aria-* attribute from one element to another, in source order.
 *
 * @param {Element} source - Element to read attributes from.
 * @param {Element} target - Element to write attributes to.
 */
function copyAriaAttributes(source, target) {
  Array.from(source.attributes)
    .filter(({ name }) => name.startsWith("aria-"))
    .forEach(({ name, value }) => {
      target.setAttribute(name, value);
    });
}
// TODO: consider resolving aria-labelledby and aria-describedby into inline
// aria-label and aria-description before stripping IDs. Elements relying on
// ID-based label references currently lose their accessible names in the
// output.
/**
 * Copy the allow-listed semantic/functional attributes plus every aria-*
 * attribute from one element to another, in source order. Everything else
 * (class, id, style, data-*, event handlers, ...) is left behind.
 *
 * @param {Element} source - Element to read attributes from.
 * @param {Element} target - Element to write attributes to.
 */
function copySemanticAttributes(source, target) {
  const allowed = new Set([
    "role", "alt", "title", "lang", "for",
    "href", "src", "action", "method",
    "name", "type", "value",
    "checked", "selected", "disabled",
    "readonly", "required", "placeholder",
    "target", "rel", "datetime", "open",
    "min", "max", "step", "pattern", "multiple",
    "colspan", "rowspan", "scope", "headers",
  ]);
  const isKept = (name) => allowed.has(name) || name.startsWith("aria-");
  for (const { name, value } of source.attributes) {
    if (isKept(name)) {
      target.setAttribute(name, value);
    }
  }
}
}
/**
 * Build a first-match-wins dispatcher for DOM nodes.
 *
 * `use(handler)` registers a catch-all. `use(test, handler)` registers a
 * conditional layer where `test` may be a Node constant (compared against
 * nodeType), a selector string (checked with `node.matches`), or a predicate
 * function. `run(node)` invokes the handler of the first layer whose test is
 * truthy and returns its result, or undefined when no layer matches.
 *
 * @returns {{use: Function, run: Function}} A new, empty router.
 */
function createRouter() {
  const matchAnything = () => true;

  // Normalize the accepted `test` shapes into a predicate on nodes.
  const toPredicate = (test) => {
    if (Object.values(Node).includes(test)) {
      return (node) => node.nodeType === test;
    }
    if (typeof test === "string") {
      return (node) => node.matches?.(test);
    }
    return test;
  };

  class Router {
    layers = [];

    use(...args) {
      if (args.length === 0) {
        return;
      }
      const [first, second] = args;
      const layer =
        args.length === 1
          ? { test: matchAnything, handler: first }
          : { test: toPredicate(first), handler: second };
      this.layers.push(layer);
    }

    run(node) {
      const match = this.layers.find((layer) => layer.test(node));
      return match ? match.handler(node) : undefined;
    }
  }

  return new Router();
}
/**
 * Prepend a comment node to the snapshot recording the capture time (ISO
 * 8601), the page URL and the viewport size, then note it in the log.
 *
 * @param {DocumentFragment} fragment - Snapshot to annotate.
 * @param {{ok: string[], fail: string[]}} log - Message sink.
 */
function addMetadata(fragment, log) {
  const capturedAt = new Date().toISOString();
  const text = ` semantic-dom snapshot | ${capturedAt} | ${location.href} | ${innerWidth}x${innerHeight} `;
  const banner = document.createComment(text);
  fragment.insertBefore(banner, fragment.firstChild);
  log.ok.push("metadata added");
}
/**
 * Append traversal statistics to the success log. The removed and unwrapped
 * counts are always reported; shadow-root, iframe and form-control counts are
 * reported only when non-zero.
 *
 * @param {{removed: number, unwrapped: number, shadowRoots: number,
 *   iframes: number, formControls: number}} stats - Counters from the walk.
 * @param {{ok: string[], fail: string[]}} log - Message sink.
 */
function logStats(stats, log) {
  const { removed, unwrapped, shadowRoots, iframes, formControls } = stats;
  log.ok.push(`removed ${removed} non-semantic elements`);
  log.ok.push(`unwrapped ${unwrapped} presentational wrappers`);
  const optionalEntries = [
    [shadowRoots, `traversed ${shadowRoots} shadow roots`],
    [iframes, `traversed ${iframes} same-origin iframes`],
    [formControls, `captured ${formControls} form controls`],
  ];
  for (const [count, message] of optionalEntries) {
    if (count) {
      log.ok.push(message);
    }
  }
}
/**
 * Print the collected messages inside one console group: successes first,
 * then failures, each prefixed with a status emoji.
 *
 * @param {{ok: string[], fail: string[]}} log - Messages to print.
 */
function printLog(log) {
  console.group("dom2semanticHTML");
  log.ok.forEach((message) => {
    console.log(`✅ ${message}`);
  });
  log.fail.forEach((message) => {
    console.log(`❌ ${message}`);
  });
  console.groupEnd();
}
/**
 * Serialize the snapshot to an HTML string by placing a deep clone of the
 * fragment into a scratch <div> and reading its innerHTML. The size of the
 * result, in whole KB, is recorded in the log.
 *
 * @param {DocumentFragment} fragment - Snapshot to serialize.
 * @param {{ok: string[], fail: string[]}} log - Message sink.
 * @returns {string} HTML markup of the snapshot.
 */
function serialize(fragment, log) {
  const scratch = document.createElement("div");
  const copy = fragment.cloneNode(true);
  scratch.appendChild(copy);
  const { innerHTML: html } = scratch;
  const kilobytes = (html.length / 1024).toFixed(0);
  log.ok.push(`${kilobytes} KB serialized`);
  return html;
}
/**
 * Trigger a browser download of the snapshot through a temporary, hidden
 * <a download> element pointing at a Blob URL.
 *
 * The anchor and the Blob URL are kept alive for one second after the click
 * so the browser has time to start the download, and are then released in a
 * `finally` block so they never leak if something throws.
 *
 * @param {string} html - Markup to save.
 * @param {string} filename - Suggested file name for the download.
 * @param {{ok: string[], fail: string[]}} log - Message sink.
 * @returns {Promise<void>} Resolves once cleanup is done.
 */
async function download(html, filename, log) {
  const blob = new Blob([html], { type: "text/html" });
  const url = URL.createObjectURL(blob);
  const anchor = document.createElement("a");
  anchor.href = url;
  anchor.download = filename;
  anchor.style.display = "none";
  document.body.appendChild(anchor);
  try {
    anchor.click();
    await delay(1000);
  } finally {
    anchor.remove();
    URL.revokeObjectURL(url);
  }
  log.ok.push(`download: ${filename}`);

  // Plain Promise constructor instead of Promise.withResolvers(), which is
  // an ES2024 addition and would throw in browsers that predate it.
  function delay(ms) {
    return new Promise((resolve) => {
      setTimeout(resolve, ms);
    });
  }
}
/**
 * Copy text to the clipboard by selecting it in an off-screen <textarea> and
 * issuing the "copy" command.
 *
 * `document.execCommand` reports an unsupported or disabled command by
 * returning false rather than throwing, so the return value is checked in
 * addition to catching exceptions. The temporary textarea is always removed.
 *
 * @param {string} text - Text to place on the clipboard.
 * @param {{ok: string[], fail: string[]}} log - Message sink.
 */
function copyToClipboard(text, log) {
  const textarea = document.createElement("textarea");
  textarea.value = text;
  textarea.style.cssText = "position:fixed;opacity:0";
  document.body.appendChild(textarea);
  try {
    textarea.select();
    if (document.execCommand("copy")) {
      log.ok.push("copied to clipboard");
    } else {
      log.fail.push("clipboard copy failed");
    }
  } catch {
    log.fail.push("clipboard copy failed");
  } finally {
    textarea.remove();
  }
}
/**
 * Derive a download file name from the current location and time:
 * `<host>[:<path with "." between segments>].<YYYY-MM-DDTHH-MM-SS>.semantic.html`.
 *
 * The path has any `;`/`?` suffix and trailing slashes removed, unsafe
 * characters replaced by underscores, and the host-plus-path part is capped
 * at 80 characters.
 *
 * @returns {string} Suggested file name for the snapshot.
 */
function snapshotFilename() {
  const { host, pathname } = location;
  const dottedPath = pathname
    .replace(/[;?].*$/, "") // query string / path parameters
    .replace(/\/+$/, "") // trailing slashes
    .replace(/^\//, "") // leading slash
    .replace(/\//g, ".") // remaining path separators
    .replace(/[\\?%*|"<>]/g, "_") // characters unsafe in file names
    .replace(/_+/g, "_"); // collapse repeated underscores
  let name = host;
  if (dottedPath) {
    name = `${host}:${dottedPath}`;
  }
  name = name.slice(0, 80);
  const isoSeconds = new Date().toISOString().slice(0, 19);
  const timestamp = isoSeconds.replace(/:/g, "-");
  return `${name}.${timestamp}.semantic.html`;
}
}
dom2semanticHTML();
How it works
The snapshot is built with a TreeWalker depth-first walk rather than a bulk cloneNode(true). Each node passes through a router that dispatches to the first matching handler:
| Match | Handler | Effect |
|---|---|---|
| Node.TEXT_NODE | copyTextNode | Keep trimmed text, drop empty |
| script, style, noscript, link, meta | stripNonSemantic | Drop entirely |
| isHidden (computed style) | stripHidden | Drop display:none, visibility:hidden |
| img | semanticImage | Keep alt, role, aria-* only |
| input, textarea, select | semanticFormControl | Capture type, name, value, label |
| option | semanticOption | Keep value, selected, text |
| a | semanticLink | Keep href, rel, target, aria-* |
| table, thead, tbody, ... | semanticTable | Keep structure attrs, strip presentation |
| (catch-all) | semanticClone | Keep semantic attrs, unwrap div/span |
After the walk, a metadata comment is prepended with snapshot date, URL, and viewport size.
What gets stripped
<script>, <style>, <noscript>, <link>, <meta>, hidden elements (detected via getComputedStyle()), presentational wrappers (<div>, <span> with no role, aria-*, lang, or title), decorative images (<img> with no alt), and non-semantic attributes (class, id, data-*, style, event handlers).
What gets preserved
Semantic HTML (<main>, <nav>, <article>, <section>, <header>, <footer>, <h1>–<h6>, <p>, <ul>, <ol>, <li>, <table>, <form>), accessibility attributes (role, aria-*, alt, title, for, lang), functional attributes (href, src, action, name, type, value), all visible text content, and live form input values.
Special traversals
- Shadow DOM: traverses element.shadowRoot recursively, including slotted light-DOM content assigned to <slot> elements
- Same-origin iframes: traverses iframe.contentDocument.body recursively
Trade-offs
| Approach | Pros | Cons |
|---|---|---|
| dom2semanticHTML() | Compact semantic HTML | Loses visual layout |
| dom2html() | Self-contained visual copy | Large output, keeps noise |
| Playwright a11y snapshot | Handles shadow DOM, iframes | Needs Playwright, JSON output |
| Raw outerHTML | One-liner | Massive, all noise included |
For automation contexts, Playwright's page.accessibility.snapshot() produces similar results with less effort but returns a JSON tree rather than semantic HTML, and it requires a Playwright context rather than working in the browser console.