Skip to content

Semantic DOM Snapshot

Extract a minimal semantic representation of a page's DOM, stripping non-semantic noise while preserving meaningful content. Walks the live DOM depth-first, copies to clipboard, and downloads automatically.

When to use

Use semantic snapshots when you need a compact, machine-readable representation of a page's meaningful content. Common cases include feeding page structure to an AI agent, building accessibility audits, extracting content for search indexing, and debugging semantic markup without visual noise.

The pattern

Copy the entire block below — function definition and invocation — and paste it into the browser console. It walks the DOM, strips non-semantic elements and attributes, unwraps presentational wrappers, traverses Shadow DOM and same-origin iframes, captures live form values, copies the result to the clipboard, and downloads it as an HTML file.

javascript
/**
 * Walk the live DOM, emit a minimal semantic snapshot as clean HTML.
 * Copies to clipboard and downloads automatically.
 *
 * Usage (browser console):
 *   dom2semanticHTML()
 */
async function dom2semanticHTML() {
  // Shared message sink: every stage appends to log.ok or log.fail and
  // printLog() prints both lists at the end.
  const log = { ok: [], fail: [] };
  // Build the cleaned tree from the live <body>, then prepend the
  // metadata comment.
  const fragment = buildSnapshot(document.body, log);
  addMetadata(fragment, log);
  
  // Serialize once; the same string is copied, downloaded and returned.
  const html = serialize(fragment, log);
  copyToClipboard(html, log);
  // download() is awaited, so the log below is printed only after the
  // download step has finished.
  await download(html, snapshotFilename(), log);
  
  printLog(log);
  return html;
  
  /**
   * Walk the live DOM depth-first. Each node is routed to the first matching
   * layer.
   */
  function buildSnapshot(body, log) {
    // Output container for everything cloned from `body`.
    const root = document.createDocumentFragment();
    // The walker visits element and text nodes below `body`.
    const walker = document.createTreeWalker(
      body,
      NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
    );
    // Maps a live node to the output node that receives its children.
    // `body` itself maps to the root fragment.
    const nodeMap = new Map([[body, root]]);
    // Counters updated by the handlers below and reported by logStats().
    const stats = {
      removed: 0,
      unwrapped: 0,
      shadowRoots: 0,
      iframes: 0,
      formControls: 0,
    };
    
    // Layers are tried in registration order and the first match wins,
    // so the catch-all semanticClone must stay last.
    const router = createRouter();
    router.use(Node.TEXT_NODE, copyTextNode);
    router.use("script, style, noscript, link, meta", stripNonSemantic);
    router.use(isHidden, stripHidden);
    router.use("img", semanticImage);
    router.use("input, textarea, select", semanticFormControl);
    router.use("option", semanticOption);
    router.use("a", semanticLink);
    router.use(
      "table, thead, tbody, tfoot, tr, th, td, caption",
      semanticTable,
    );
    router.use(semanticClone);
    
    let node;
    while ((node = walker.nextNode())) {
      // A node whose parent has no entry in nodeMap sits below a dropped
      // node: dropped nodes are never added to the map, so the whole
      // subtree is skipped here.
      const parent = nodeMap.get(node.parentNode);
      if (!parent) continue;
      // A falsy result means the handler dropped the node.
      const result = router.run(node);
      if (!result) continue;
      
      if (result.nodeType === Node.DOCUMENT_FRAGMENT_NODE) {
        // Unwrapped wrapper — map children to parent so their
        // descendants attach correctly
        const children = [...result.childNodes];
        for (const child of children) {
          parent.appendChild(child);
        }
        nodeMap.set(node, parent);
      } else {
        parent.appendChild(result);
        nodeMap.set(node, result);
      }
      
      // Traverse shadow DOM (including slotted content).
      // Use nodeMap target — if the host was unwrapped,
      // result is an empty fragment; nodeMap points to
      // the correct parent.
      // NOTE(review): the walker still descends into the host's light-DOM
      // children after this call, so content that is also assigned to a
      // <slot> looks like it is emitted twice — confirm on a real page.
      const target = nodeMap.get(node);
      if (node.shadowRoot) {
        traverseShadowRoot(node, target);
      }
      // Traverse same-origin iframes
      if (node.tagName === "IFRAME" && node.contentDocument) {
        traverseIframe(node, target);
      }
    }
    
    logStats(stats, log);
    return root;
    
    /**
     * Copy a text node as trimmed text.
     *
     * Text that sits directly inside <option> or <textarea> is skipped:
     * semanticOption and semanticFormControl already write that content
     * onto their clones, so copying the text child as well would emit it
     * twice.
     *
     * @param {Text} node - Text node reached by the walk.
     * @returns {Text|false} A fresh text node, or false to drop the node.
     */
    function copyTextNode(node) {
      const owner = node.parentNode?.nodeName?.toLowerCase();
      if (owner === "option" || owner === "textarea") return false;

      const text = node.textContent.trim();
      if (!text) return false;
      return document.createTextNode(text);
    }
    
    /**
     * Handler for script/style/noscript/link/meta: count the node as
     * removed and drop it.
     *
     * @returns {false} Always false, which tells the walk to skip the node.
     */
    function stripNonSemantic() {
      stats.removed += 1;
      return false;
    }
    
    /**
     * Handler for nodes matched by isHidden: count the node as removed
     * and drop it.
     *
     * @returns {false} Always false, which tells the walk to skip the node.
     */
    function stripHidden() {
      stats.removed += 1;
      return false;
    }
    
    /**
     * Reduce an <img> to its accessible description.
     *
     * Images without any alt attribute are dropped and counted as removed.
     * An empty alt is kept as alt="": it marks the image as decorative,
     * and omitting it would make the output read as "alt text missing".
     *
     * @param {HTMLImageElement} node - Live image element.
     * @returns {HTMLImageElement|false} Clone carrying alt, aria-* and
     *   role only, or false when the image is dropped.
     */
    function semanticImage(node) {
      const alt = node.getAttribute("alt");
      if (alt === null) {
        stats.removed++;
        return false;
      }
      const img = document.createElement("img");
      img.setAttribute("alt", alt);
      copyAriaAttributes(node, img);
      const role = node.getAttribute("role");
      if (role) img.setAttribute("role", role);
      return img;
    }
    
    /**
     * Clone an <input>, <textarea> or <select> with its functional
     * attributes and its live state.
     *
     * - Checkboxes and radios record `checked` from the live property.
     * - Other inputs record the live `value`, except file inputs and
     *   password inputs: the snapshot is copied to the clipboard and
     *   written to disk, so secrets must not be exported.
     * - Textareas record the live value as text content.
     * - Label text from findLabel() is used as aria-label only when the
     *   control has no aria-label of its own, so an author-provided
     *   accessible name is never overwritten.
     *
     * @param {HTMLInputElement|HTMLTextAreaElement|HTMLSelectElement} node
     *   Live form control.
     * @returns {HTMLElement} The cleaned clone (never dropped).
     */
    function semanticFormControl(node) {
      const tag = node.tagName.toLowerCase();
      const el = document.createElement(tag);
      stats.formControls++;

      const functionalAttributes = [
        "type", "name", "placeholder",
        "required", "disabled", "readonly", "multiple",
        "min", "max", "step", "pattern",
      ];
      for (const attr of functionalAttributes) {
        if (node.hasAttribute(attr)) {
          el.setAttribute(attr, node.getAttribute(attr));
        }
      }
      copyAriaAttributes(node, el);
      const role = node.getAttribute("role");
      if (role) el.setAttribute("role", role);

      if (tag === "input") {
        const type = node.type;
        if (type === "checkbox" || type === "radio") {
          if (node.checked) el.setAttribute("checked", "");
        } else if (type !== "file" && type !== "password" && node.value) {
          el.setAttribute("value", node.value);
        }
      } else if (tag === "textarea") {
        el.textContent = node.value;
      }

      // Associate label text, but keep an explicit aria-label if present.
      if (!el.hasAttribute("aria-label")) {
        const label = findLabel(node);
        if (label) {
          el.setAttribute("aria-label", label);
        }
      }

      return el;
    }
    
    /**
     * Clone an <option>, keeping its live value, its selected state and
     * its trimmed text.
     *
     * @param {HTMLOptionElement} node - Live option element.
     * @returns {HTMLOptionElement} The cleaned clone.
     */
    function semanticOption(node) {
      const clone = document.createElement("option");
      const { value, selected } = node;
      if (value) clone.setAttribute("value", value);
      if (selected) clone.setAttribute("selected", "");
      clone.textContent = node.textContent.trim();
      return clone;
    }
    
    /**
     * Clone an <a>, keeping href, rel and target, then aria-*, then role.
     * Attributes that are missing or empty are left out.
     *
     * @param {HTMLAnchorElement} node - Live anchor element.
     * @returns {HTMLAnchorElement} The cleaned clone.
     */
    function semanticLink(node) {
      const link = document.createElement("a");
      const carryOver = (name) => {
        const value = node.getAttribute(name);
        if (value) link.setAttribute(name, value);
      };

      for (const name of ["href", "rel", "target"]) {
        carryOver(name);
      }
      copyAriaAttributes(node, link);
      carryOver("role");
      return link;
    }
    
    /**
     * Clone a table-structure element, keeping only colspan, rowspan,
     * scope and headers, then aria-*, then role.
     *
     * @param {HTMLElement} node - Live table, section, row, cell or caption.
     * @returns {HTMLElement} Clone with the same tag name.
     */
    function semanticTable(node) {
      const clone = document.createElement(node.tagName.toLowerCase());

      ["colspan", "rowspan", "scope", "headers"]
        .filter((name) => node.hasAttribute(name))
        .forEach((name) => {
          clone.setAttribute(name, node.getAttribute(name));
        });

      copyAriaAttributes(node, clone);
      const role = node.getAttribute("role");
      if (role) {
        clone.setAttribute("role", role);
      }
      return clone;
    }
    
    /**
     * Catch-all handler. A <div> or <span> for which hasSemantic() is
     * false is unwrapped: an empty DocumentFragment is returned and the
     * unwrapped counter is bumped. Every other element is recreated with
     * its semantic attributes only.
     *
     * @param {Element} node - Live element.
     * @returns {DocumentFragment|HTMLElement} Empty fragment for an
     *   unwrapped wrapper, otherwise the cleaned clone.
     */
    function semanticClone(node) {
      const tag = node.tagName.toLowerCase();
      const isGenericWrapper = tag === "div" || tag === "span";

      if (isGenericWrapper && !hasSemantic(node)) {
        stats.unwrapped += 1;
        return document.createDocumentFragment();
      }

      const clone = document.createElement(tag);
      copySemanticAttributes(node, clone);
      return clone;
    }
    
    /**
     * Walk a host's shadow tree and attach the cleaned result to the
     * host's output node.
     *
     * <slot> elements are replaced by their assigned nodes (flattened),
     * each processed through processSubtree(). Elements found inside the
     * shadow tree that have their own shadow root, or that are
     * same-origin iframes, are traversed as well, so nested components
     * are not silently left empty.
     *
     * @param {Element} hostNode - Live element that owns a shadowRoot.
     * @param {Node} hostClone - Output node that receives the content.
     */
    function traverseShadowRoot(hostNode, hostClone) {
      stats.shadowRoots++;
      const shadowWalker = document.createTreeWalker(
        hostNode.shadowRoot,
        NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
      );
      // Live shadow node -> output node that receives its children.
      const shadowMap = new Map(
        [[hostNode.shadowRoot, hostClone]],
      );

      let sNode;
      while ((sNode = shadowWalker.nextNode())) {
        // No mapping means the parent was dropped (or is a <slot>, which
        // is never mapped), so this node is skipped too.
        const sParent = shadowMap.get(sNode.parentNode);
        if (!sParent) continue;

        // Handle slotted content: when we encounter a
        // <slot>, process its assigned nodes from the
        // light DOM instead
        if (
          sNode.nodeType === Node.ELEMENT_NODE &&
          sNode.tagName === "SLOT"
        ) {
          const assigned = sNode.assignedNodes({
            flatten: true,
          });
          for (const aNode of assigned) {
            const aResult = processSubtree(aNode);
            if (aResult) sParent.appendChild(aResult);
          }
          continue;
        }

        const sResult = router.run(sNode);
        if (!sResult) continue;

        if (
          sResult.nodeType ===
            Node.DOCUMENT_FRAGMENT_NODE
        ) {
          // Unwrapped wrapper: its children attach to the parent.
          for (const child of [...sResult.childNodes]) {
            sParent.appendChild(child);
          }
          shadowMap.set(sNode, sParent);
        } else {
          sParent.appendChild(sResult);
          shadowMap.set(sNode, sResult);
        }

        // Nested components: the walker does not enter another shadow
        // root or an iframe document, so descend explicitly.
        const sTarget = shadowMap.get(sNode);
        if (sNode.shadowRoot) {
          traverseShadowRoot(sNode, sTarget);
        }
        if (sNode.tagName === "IFRAME" && sNode.contentDocument) {
          traverseIframe(sNode, sTarget);
        }
      }
    }
    
    /**
     * Snapshot the body of an iframe's document and append the result to
     * the iframe's output node. Does nothing when the document has no
     * body. Any exception raised along the way is recorded in log.fail
     * together with the iframe's src.
     *
     * @param {HTMLIFrameElement} iframeNode - Live iframe element.
     * @param {Node} iframeClone - Output node that receives the snapshot.
     */
    function traverseIframe(iframeNode, iframeClone) {
      try {
        const { body: frameBody } = iframeNode.contentDocument;
        if (frameBody) {
          stats.iframes += 1;
          const nested = buildSnapshot(frameBody, log);
          iframeClone.appendChild(nested);
        }
      } catch {
        log.fail.push(`iframe (cross-origin): ${iframeNode.src}`);
      }
    }
    
    /**
     * Recursively process a subtree (used for slotted
     * content that lives outside the TreeWalker scope).
     */
    /**
     * Recursively clean one subtree. Used for slotted content, which
     * lives in the light DOM and is therefore outside the shadow walker.
     *
     * Text nodes go through the router like everywhere else, so a single
     * handler decides how text is kept. Elements that own a shadow root,
     * or that are same-origin iframes, are traversed the same way the
     * main walk does it.
     *
     * @param {Node} node - Live node to process.
     * @returns {Node|null} Cleaned node (an unwrapped wrapper yields a
     *   DocumentFragment holding its children), or null when dropped.
     */
    function processSubtree(node) {
      const isText = node.nodeType === Node.TEXT_NODE;
      if (!isText && node.nodeType !== Node.ELEMENT_NODE) {
        return null;
      }

      const result = router.run(node);
      if (!result) return null;
      if (isText) return result;

      // Shadow content first, then light-DOM children, matching the
      // order used by the main walk.
      if (node.shadowRoot) {
        traverseShadowRoot(node, result);
      }
      if (node.tagName === "IFRAME" && node.contentDocument) {
        traverseIframe(node, result);
      }

      // Element clone or fragment: both accept children directly.
      for (const child of node.childNodes) {
        const childResult = processSubtree(child);
        if (childResult) {
          result.appendChild(childResult);
        }
      }

      return result;
    }
    
    /**
     * Find the label text for a form control.
     *
     * A wrapping <label> is tried first, then a <label for="id">. The
     * `for` lookup runs against the control's own root (document, shadow
     * root or iframe document), because IDs are scoped to that root; a
     * query on the top-level document would miss the label or match an
     * unrelated element with the same id.
     *
     * @param {Element} input - Live form control.
     * @returns {string|null} Trimmed label text, or null when none found.
     */
    function findLabel(input) {
      // Check for wrapping <label>
      const wrapper = input.closest?.("label");
      if (wrapper) {
        const text = wrapper.textContent.trim();
        if (text) return text;
      }
      // Check for [for] label in the control's own tree
      const id = input.id;
      if (id) {
        const scope = input.getRootNode?.() ?? document;
        const label = scope.querySelector?.(
          `label[for="${CSS.escape(id)}"]`,
        );
        if (label) return label.textContent.trim();
      }
      return null;
    }
    
    /**
     * Router test: is this element hidden?
     *
     * True for elements with a truthy `hidden` property, with
     * aria-hidden="true", or (when connected) with a computed style of
     * display:none, visibility:hidden or opacity "0". Non-element nodes
     * and disconnected elements without those attributes are not hidden.
     *
     * @param {Node} node - Node being routed.
     * @returns {boolean} Whether the node should be stripped as hidden.
     */
    function isHidden(node) {
      if (node.nodeType !== Node.ELEMENT_NODE) return false;

      const flaggedHidden =
        node.hidden || node.getAttribute("aria-hidden") === "true";
      if (flaggedHidden) return true;

      // Computed style is only consulted for connected elements.
      if (!node.isConnected) return false;

      const { display, visibility, opacity } = getComputedStyle(node);
      return (
        display === "none" ||
        visibility === "hidden" ||
        opacity === "0"
      );
    }
    
    /**
     * Does this element carry meaning of its own? True when it has a
     * non-empty role, lang or title attribute, or any aria-* attribute.
     *
     * @param {Element} element - Live element.
     * @returns {boolean} Whether the element must not be unwrapped.
     */
    function hasSemantic(element) {
      const hasNamedAttribute = ["role", "lang", "title"].some(
        (name) => element.getAttribute(name),
      );
      if (hasNamedAttribute) return true;

      return [...element.attributes].some(
        ({ name }) => name.startsWith("aria-"),
      );
    }
    
    /**
     * Copy every aria-* attribute from source to target, in source order.
     *
     * @param {Element} source - Live element to read from.
     * @param {Element} target - Clone to write to.
     */
    function copyAriaAttributes(source, target) {
      [...source.attributes]
        .filter(({ name }) => name.startsWith("aria-"))
        .forEach(({ name, value }) => {
          target.setAttribute(name, value);
        });
    }
    
    // TODO: consider resolving aria-labelledby and aria-describedby into inline
    // aria-label and aria-description before stripping IDs. Elements relying on
    // ID-based label references currently lose their accessible names in the
    // output.
    
    /**
     * Copy the attributes that carry meaning or function from source to
     * target, in source order: everything on the allow-list below plus
     * any aria-* attribute. All other attributes are left behind.
     *
     * @param {Element} source - Live element to read from.
     * @param {Element} target - Clone to write to.
     */
    function copySemanticAttributes(source, target) {
      const allowed = new Set([
        "role", "alt", "title", "lang", "for",
        "href", "src", "action", "method",
        "name", "type", "value",
        "checked", "selected", "disabled",
        "readonly", "required", "placeholder",
        "target", "rel", "datetime", "open",
        "min", "max", "step", "pattern", "multiple",
        "colspan", "rowspan", "scope", "headers",
      ]);
      const isKept = (name) =>
        allowed.has(name) || name.startsWith("aria-");

      for (const { name, value } of source.attributes) {
        if (isKept(name)) {
          target.setAttribute(name, value);
        }
      }
    }
  }
  
  /**
   * Create a first-match-wins router for DOM nodes.
   *
   * `use(handler)` registers a catch-all. `use(test, handler)` registers
   * a layer whose test is one of:
   *   - a number: matches nodes whose nodeType equals it,
   *   - a string: a selector checked with node.matches() (nodes without
   *     matches(), such as text nodes, never match),
   *   - a function: called with the node; a truthy result is a match.
   *
   * Invalid tests or handlers are rejected when registered, so a bad
   * layer cannot surface later as an obscure error in the middle of a
   * walk.
   *
   * @returns {{ use: Function, run: Function, layers: Array }} Router.
   *   run(node) returns the first matching handler's result, or
   *   undefined when no layer matches.
   */
  function createRouter() {
    class Router {
      layers = [];

      use(...args) {
        const handler = args.length === 1 ? args[0] : args[1];
        if (typeof handler !== "function") {
          throw new TypeError("router.use: handler must be a function");
        }

        const test = args.length === 1
          ? () => true
          : toPredicate(args[0]);
        this.layers.push({ test, handler });
      }

      run(node) {
        for (const layer of this.layers) {
          if (layer.test(node)) {
            return layer.handler(node);
          }
        }
        return undefined;
      }
    }

    /** Turn a nodeType number, selector string or function into a test. */
    function toPredicate(test) {
      if (typeof test === "number") {
        return (node) => node.nodeType === test;
      }
      if (typeof test === "string") {
        return (node) => node.matches?.(test);
      }
      if (typeof test === "function") {
        return test;
      }
      throw new TypeError(
        "router.use: test must be a nodeType, a selector or a function",
      );
    }

    return new Router();
  }
  
  /**
   * Insert a comment at the start of the fragment recording the current
   * ISO timestamp, the page URL and the viewport size, then log it.
   *
   * @param {DocumentFragment} fragment - Snapshot to annotate.
   * @param {{ok: string[], fail: string[]}} log - Message sink.
   */
  function addMetadata(fragment, log) {
    const fields = [
      "semantic-dom snapshot",
      new Date().toISOString(),
      location.href,
      `${innerWidth}x${innerHeight}`,
    ];
    const banner = document.createComment(` ${fields.join(" | ")} `);
    fragment.insertBefore(banner, fragment.firstChild);
    log.ok.push("metadata added");
  }
  
  /**
   * Append a summary of the walk to log.ok. The removed and unwrapped
   * counts are always reported; shadow roots, iframes and form controls
   * are reported only when their count is non-zero.
   *
   * @param {{removed: number, unwrapped: number, shadowRoots: number,
   *   iframes: number, formControls: number}} stats - Walk counters.
   * @param {{ok: string[], fail: string[]}} log - Message sink.
   */
  function logStats(stats, log) {
    const { removed, unwrapped, shadowRoots, iframes, formControls } = stats;

    log.ok.push(`removed ${removed} non-semantic elements`);
    log.ok.push(`unwrapped ${unwrapped} presentational wrappers`);

    const optionalLines = [
      [shadowRoots, `traversed ${shadowRoots} shadow roots`],
      [iframes, `traversed ${iframes} same-origin iframes`],
      [formControls, `captured ${formControls} form controls`],
    ];
    for (const [count, message] of optionalLines) {
      if (count) log.ok.push(message);
    }
  }
  
  /**
   * Print the collected messages as one console group: successes first,
   * then failures, each prefixed with a status emoji.
   *
   * @param {{ok: string[], fail: string[]}} log - Messages to print.
   */
  function printLog(log) {
    console.group("dom2semanticHTML");
    log.ok.forEach((message) => {
      console.log(`✅ ${message}`);
    });
    log.fail.forEach((message) => {
      console.log(`❌ ${message}`);
    });
    console.groupEnd();
  }
  
  /**
   * Turn the fragment into an HTML string by deep-cloning it into a
   * scratch <div> and reading innerHTML. The fragment itself is left
   * untouched. Logs the size as string length / 1024, rounded.
   *
   * @param {DocumentFragment} fragment - Snapshot to serialize.
   * @param {{ok: string[], fail: string[]}} log - Message sink.
   * @returns {string} Serialized HTML.
   */
  function serialize(fragment, log) {
    const scratch = document.createElement("div");
    scratch.appendChild(fragment.cloneNode(true));

    const { innerHTML: html } = scratch;
    const kilobytes = (html.length / 1024).toFixed(0);
    log.ok.push(`${kilobytes} KB serialized`);
    return html;
  }
  
  /**
   * Trigger a browser download of the HTML string.
   *
   * A hidden <a download> pointing at a blob URL is clicked, then kept
   * alive for one second before being cleaned up so the browser has time
   * to start the download. Cleanup runs in `finally`, so the anchor and
   * the blob URL are released even if the click or the wait throws.
   *
   * @param {string} html - Content to save.
   * @param {string} filename - Suggested file name.
   * @param {{ok: string[], fail: string[]}} log - Message sink.
   * @returns {Promise<void>} Resolves after cleanup.
   */
  async function download(html, filename, log) {
    const blob = new Blob([html], { type: "text/html" });
    const url = URL.createObjectURL(blob);
    const anchor = document.createElement("a");
    anchor.href = url;
    anchor.download = filename;
    anchor.style.display = "none";
    document.body.appendChild(anchor);
    try {
      anchor.click();
      await delay(1000);
    } finally {
      anchor.remove();
      URL.revokeObjectURL(url);
    }
    log.ok.push(`download: ${filename}`);

    // Plain Promise constructor: works in every browser that supports
    // async functions, unlike Promise.withResolvers().
    function delay(ms) {
      return new Promise((resolve) => {
        setTimeout(resolve, ms);
      });
    }
  }
  
  /**
   * Copy text to the clipboard through a temporary off-screen textarea
   * and document.execCommand("copy").
   *
   * execCommand reports failure by returning false rather than by
   * throwing, so the return value decides which log entry is written.
   * The textarea is removed in `finally`, whatever happens.
   *
   * @param {string} text - Text to copy.
   * @param {{ok: string[], fail: string[]}} log - Message sink.
   */
  function copyToClipboard(text, log) {
    const textarea = document.createElement("textarea");
    textarea.value = text;
    textarea.style.cssText = "position:fixed;opacity:0";
    document.body.appendChild(textarea);
    try {
      textarea.select();
      if (document.execCommand("copy")) {
        log.ok.push("copied to clipboard");
      } else {
        log.fail.push("clipboard copy failed");
      }
    } catch {
      log.fail.push("clipboard copy failed");
    } finally {
      textarea.remove();
    }
  }
  
  /**
   * Build a download file name from the current location and time:
   * `<host>_<path.with.dots>.<YYYY-MM-DDTHH-MM-SS>.semantic.html`.
   *
   * The name never contains ":" — it is not allowed in file names on
   * Windows and is problematic on macOS. That covers both a port in the
   * host ("localhost:3000") and the host/path separator, matching the
   * timestamp, which already swaps its colons for dashes. The host+path
   * part is capped at 80 characters.
   *
   * @returns {string} File name for the snapshot.
   */
  function snapshotFilename() {
    const queryString = /[;?].*$/;
    const trailingSlashes = /\/+$/;
    const leadingSlash = /^\//;
    const pathSlashes = /\//g;
    const unsafeChars = /[\\?%*|"<>:]/g;
    const repeatedUnderscores = /_+/g;

    const segments = location.pathname
      .replace(queryString, "")
      .replace(trailingSlashes, "")
      .replace(leadingSlash, "")
      .replace(pathSlashes, ".");
    const label = segments
      ? `${location.host}_${segments}`
      : location.host;
    // Sanitize host and path together so a port colon is covered too.
    const name = label
      .replace(unsafeChars, "_")
      .replace(repeatedUnderscores, "_")
      .slice(0, 80);
    const timestamp = new Date()
      .toISOString()
      .replace(/:/g, "-")
      .slice(0, 19);

    return `${name}.${timestamp}.semantic.html`;
  }
}

// Run immediately when the block is pasted. The function is async, so
// this call evaluates to a promise that resolves to the HTML string.
dom2semanticHTML();

How it works

The snapshot is built with a TreeWalker depth-first walk rather than a bulk cloneNode(true). Each node passes through a router that dispatches to the first matching handler:

| Match | Handler | Effect |
| --- | --- | --- |
| Node.TEXT_NODE | copyTextNode | Keep trimmed text, drop empty |
| script, style, noscript, link, meta | stripNonSemantic | Drop entirely |
| isHidden (attributes + computed style) | stripHidden | Drop hidden, aria-hidden="true", display:none, visibility:hidden, opacity:0 |
| img | semanticImage | Keep alt, role, aria-* only |
| input, textarea, select | semanticFormControl | Capture type, name, value, label |
| option | semanticOption | Keep value, selected, text |
| a | semanticLink | Keep href, rel, target, aria-* |
| table, thead, tbody, ... | semanticTable | Keep structure attrs, strip presentation |
| (catch-all) | semanticClone | Keep semantic attrs, unwrap div/span |

After the walk, a metadata comment is prepended with snapshot date, URL, and viewport size.

What gets stripped

<script>, <style>, <noscript>, <link>, <meta>, hidden elements (the hidden attribute, aria-hidden="true", or detected via getComputedStyle()), presentational wrappers (<div>, <span> with no role, aria-*, lang, or title), images without an alt attribute (<img> with no alt), and non-semantic attributes (class, id, data-*, style, event handlers).

What gets preserved

Semantic HTML (<main>, <nav>, <article>, <section>, <header>, <footer>, <h1>–<h6>, <p>, <ul>, <ol>, <li>, <table>, <form>), accessibility attributes (role, aria-*, alt, title, for, lang), functional attributes (href, src, action, name, type, value), all visible text content, and live form input values.

Special traversals

  • Shadow DOM: traverses element.shadowRoot recursively, including slotted light-DOM content assigned to <slot> elements
  • Same-origin iframes: traverses iframe.contentDocument.body recursively

Trade-offs

| Approach | Pros | Cons |
| --- | --- | --- |
| dom2semanticHTML() | Compact semantic HTML | Loses visual layout |
| dom2html() | Self-contained visual copy | Large output, keeps noise |
| Playwright a11y snapshot | Handles shadow DOM, iframes | Needs Playwright, JSON output |
| Raw outerHTML | One-liner | Massive, all noise included |

For automation contexts, Playwright's page.accessibility.snapshot() produces similar results with less effort but returns a JSON tree rather than semantic HTML, and it requires a Playwright context rather than working in the browser console.