JS
#schema-org#structured-data#microdata#json-ld#seo#cli#nodejs#validation

CLI валидатор микроразметки: JSON-LD + Microdata + RDFa (schema.org)

Node.js CLI проверяет structured data на странице (schema.org), печатает ошибки/предупреждения в консоль и сохраняет отчёт в JSON. Удобно для CI и ручной диагностики.

Как использовать

  1. Установи зависимости через npm (один раз). Нужен Node.js 18+: скрипт использует встроенный fetch.
  2. Запусти: node sd-validate.mjs <url|file.html>.
  3. Смотри вывод в консоли и отчёт JSON на диске (sd-report-*.json).

Этот CLI — не HTML-валидатор (W3C Nu): он проверяет structured data — JSON-LD, Microdata и RDFa — по модели schema.org. Нужен, когда Google Rich Results Test или Schema Markup Validator ругаются на Product/Offer (price, image, ratingCount и т.д.), хотя сам HTML «валиден».

Установка

mkdir -p tools/sd-validator && cd tools/sd-validator
npm init -y
npm i @adobe/structured-data-validator @marbec/web-auto-extractor chalk yargs

Скрипт sd-validate.mjs

#!/usr/bin/env node
import fs from "node:fs/promises";
import path from "node:path";
import { createHash } from "node:crypto";
import process from "node:process";

import chalk from "chalk";
import yargs from "yargs/yargs";
import { hideBin } from "yargs/helpers";

import SDV from "@adobe/structured-data-validator";
import WebAutoExtractor from "@marbec/web-auto-extractor";

// Different builds of the validator package may expose the class as a
// `Validator` property of the default import, under a nested `default`, or as
// the default import itself. Take the first candidate that is not null/undefined.
const Validator = [SDV?.Validator, SDV?.default, SDV].find(
  (candidate) => candidate != null,
);
if (!Validator) {
  throw new Error("Cannot resolve Validator export from @adobe/structured-data-validator");
}

// Command-line flags, keyed by option name. Descriptions are shown by --help.
const cliOptions = {
  // Report destination; an empty value means "derive the name from the input".
  out: {
    type: "string",
    default: "",
    describe: "Путь для отчёта (json). По умолчанию: ./sd-report-<hash>.json",
  },
  // Drop everything that is not severity ERROR.
  onlyErrors: {
    type: "boolean",
    default: false,
    describe: "Показывать только ERROR (WARNING скрыть)",
  },
  // Keep only issues whose serialized form mentions microdata attributes.
  microdataOnly: {
    type: "boolean",
    default: false,
    describe: "Фильтр сообщений по microdata/itemprop/itemscope/itemtype",
  },
  // Abort the page download after this many milliseconds.
  timeout: {
    type: "number",
    default: 20000,
    describe: "Таймаут загрузки URL (мс)",
  },
  // User-Agent header sent when the input is a URL.
  userAgent: {
    type: "string",
    default:
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) sd-validate/1.0",
    describe: "User-Agent для скачивания HTML по URL",
  },
};

// Parse the command line synchronously; at least one positional argument is required.
const argv = yargs(hideBin(process.argv))
  .scriptName("sd-validate")
  .usage("$0 [options] <url|file.html>")
  .options(cliOptions)
  .demandCommand(1)
  .help()
  .parseSync();

// First positional argument: the URL or the path of the HTML file to validate.
const [target] = argv._;
const input = String(target);

/**
 * Tells whether a string starts with an `http://` or `https://` scheme.
 * The match is case-insensitive and anchored at the very first character.
 *
 * @param {string} s - Candidate input (URL or file path).
 * @returns {boolean} `true` for http(s) URLs, `false` for anything else.
 */
function isUrl(s) {
  const httpScheme = /^https?:\/\//i;
  return httpScheme.test(s);
}

/**
 * Short fingerprint of a string: the first 10 hex characters of its SHA-1 digest.
 * Used to build a stable report file name, not for anything security-related.
 *
 * @param {string} s - Text to hash.
 * @returns {string} 10 lowercase hexadecimal characters.
 */
function sha1(s) {
  const hasher = createHash("sha1");
  hasher.update(s);
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 10);
}

/**
 * Downloads a resource as text, aborting the request if it takes too long.
 * The timer covers both the request itself and reading the response body.
 *
 * @param {string} url - Address to download.
 * @param {{ timeout: number, headers: object }} options - Abort delay in
 *   milliseconds and the request headers to send.
 * @returns {Promise<{ ok: boolean, status: number, text: string }>} Response
 *   status flags together with the body text.
 */
async function fetchWithTimeout(url, { timeout, headers }) {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), timeout);
  const requestInit = { redirect: "follow", signal: controller.signal, headers };

  try {
    const response = await fetch(url, requestInit);
    const text = await response.text();
    const { ok, status } = response;
    return { ok, status, text };
  } finally {
    // Always disarm the timer so it cannot fire after the request has settled.
    clearTimeout(abortTimer);
  }
}

/**
 * Renders one validator issue as a single console line:
 * a coloured severity badge, the message and, when present, a dimmed
 * parenthesised list of details (field names, location, path).
 *
 * @param {object} i - Issue object; reads `severity`, `issueMessage`,
 *   `fieldNames`, `location` and `path`.
 * @returns {string} Formatted line.
 */
function formatIssue(i) {
  // Every severity other than "ERROR" is displayed with the warning badge.
  const isError = i.severity === "ERROR";
  const badge = isError
    ? chalk.bgRed.white(" ERROR ")
    : chalk.bgYellow.black(" WARN  ");

  const message = i.issueMessage ?? "Unknown issue";

  // Collect only the details that are actually available.
  const details = [];
  if (Array.isArray(i.fieldNames) && i.fieldNames.length) {
    details.push(`fields: ${i.fieldNames.join(", ")}`);
  }
  if (i.location) {
    details.push(`loc: ${i.location}`);
  }
  if (Array.isArray(i.path) && i.path.length) {
    details.push(`path: ${JSON.stringify(i.path)}`);
  }

  if (details.length === 0) {
    return `${badge} ${message}`;
  }
  return `${badge} ${message}${chalk.dim(`  (${details.join(" | ")})`)}`;
}

/**
 * Prints a dimmed horizontal rule to visually separate console sections.
 */
function hr() {
  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  console.log(chalk.dim(rule));
}

// Complete schema.org vocabulary (JSON-LD) that is handed to the validator.
const SCHEMA_ORG_MODEL_URL = "https://schema.org/version/latest/schemaorg-all-https.jsonld";

/**
 * Downloads and parses the schema.org vocabulary.
 * Uses the same timeout as the page download so an unreachable host cannot
 * hang the run, and fails with an explicit message on a non-2xx response or
 * a body that is not JSON.
 *
 * @returns {Promise<object>} Parsed JSON-LD document.
 * @throws {Error} When the download fails, times out, or is not valid JSON.
 */
async function loadSchemaOrgModel() {
  const { ok, status, text } = await fetchWithTimeout(SCHEMA_ORG_MODEL_URL, {
    timeout: argv.timeout,
    headers: {},
  });
  if (!ok) {
    throw new Error(`Cannot load schema.org model. HTTP=${status}`);
  }
  try {
    return JSON.parse(text);
  } catch (cause) {
    throw new Error("schema.org model response is not valid JSON", { cause });
  }
}

/**
 * Entry point: loads the HTML (from a URL or a local file), extracts
 * structured data, validates it against the schema.org model, prints a
 * summary with details, writes a JSON report and exits.
 *
 * Exit codes: 1 when at least one ERROR remains after filtering, 0 otherwise.
 * Any exception propagates to the caller.
 *
 * @returns {Promise<void>} Never resolves normally: the process exits first.
 */
async function main() {
  console.log(chalk.cyan("🔍 Structured Data validation"));
  hr();
  console.log(`Input: ${chalk.bold(input)}`);

  // 1) Load HTML
  let html = "";
  const sourceMeta = { type: isUrl(input) ? "url" : "file", input };

  if (isUrl(input)) {
    const { ok, status, text } = await fetchWithTimeout(input, {
      timeout: argv.timeout,
      headers: { "user-agent": argv.userAgent },
    });
    html = text;
    sourceMeta.http = { status, ok };

    if (!html || html.trim().length < 50) {
      throw new Error(`Похоже, вернулся пустой/короткий HTML. HTTP=${status}`);
    }

    // An error page usually carries no structured data and would otherwise
    // look like a clean result; make the situation visible.
    if (!ok) {
      console.warn(
        chalk.yellow(`⚠️  HTTP ${status}: validating the body of a non-2xx response`),
      );
    }
  } else {
    html = await fs.readFile(input, "utf8");
    sourceMeta.file = { abs: path.resolve(input) };
  }

  // 2) Quick microdata scan (raw): plain regex counts over the markup,
  //    independent of what the extractor finds later.
  const microdataCount = (html.match(/\bitemscope\b/gi) || []).length;
  const itemtypeCount = (html.match(/\bitemtype\s*=\s*["'][^"']+["']/gi) || []).length;
  const itempropCount = (html.match(/\bitemprop\s*=\s*["'][^"']+["']/gi) || []).length;

  console.log("");
  console.log(chalk.green("📦 Found in HTML (raw scan):"));
  console.log(`  • itemscope: ${microdataCount}`);
  console.log(`  • itemtype:  ${itemtypeCount}`);
  console.log(`  • itemprop:  ${itempropCount}`);

  // 3) Extract structured data (JSON-LD + microdata + RDFa)
  const extractor = new WebAutoExtractor({
    addLocation: true,
    embedSource: ["rdfa", "microdata"],
  });
  const extracted = extractor.parse(html);

  // 4) Load schema.org model
  console.log("");
  console.log(chalk.cyan("⬇️  Loading schema.org model..."));
  const schemaOrgJson = await loadSchemaOrgModel();

  // 5) Validate
  console.log(chalk.cyan("🧪 Validating..."));
  const validator = new Validator(schemaOrgJson);
  const issuesAll = await validator.validate(extracted);

  let issues = Array.isArray(issuesAll) ? issuesAll : [];
  if (argv.onlyErrors) issues = issues.filter((x) => x.severity === "ERROR");

  if (argv.microdataOnly) {
    // Text search over the serialized issue: keeps anything that mentions
    // microdata or one of its attributes anywhere in its fields.
    issues = issues.filter((x) => {
      const s = JSON.stringify(x).toLowerCase();
      return (
        s.includes("microdata") ||
        s.includes("itemprop") ||
        s.includes("itemscope") ||
        s.includes("itemtype")
      );
    });
  }

  // Counts are taken after filtering, so they match what is printed and saved.
  const errors = issues.filter((x) => x.severity === "ERROR").length;
  const warns = issues.filter((x) => x.severity !== "ERROR").length;

  console.log("");
  hr();
  console.log(`✅ Issues total: ${chalk.bold(String(issues.length))}`);
  console.log(`❌ Errors:      ${errors ? chalk.red(errors) : chalk.green("0")}`);
  console.log(`⚠️  Warnings:   ${warns ? chalk.yellow(warns) : chalk.green("0")}`);
  hr();

  if (issues.length) {
    console.log(chalk.cyan("📋 Details:"));
    for (const i of issues) console.log("  " + formatIssue(i));
  } else {
    console.log(chalk.green("🎉 No issues found (for this validator)."));
  }

  // 6) Save report
  const out =
    argv.out && argv.out.trim()
      ? argv.out
      : path.resolve(process.cwd(), `sd-report-${sha1(input)}.json`);

  const report = {
    version: "1.0",
    ts: new Date().toISOString(),
    input: sourceMeta,
    stats: { issues: issues.length, errors, warnings: warns },
    htmlScan: { itemscope: microdataCount, itemtype: itemtypeCount, itemprop: itempropCount },
    issues,
  };

  // Create the target directory first so that e.g. `--out ./reports/x.json`
  // works on a clean checkout; this is a no-op when the directory exists.
  await fs.mkdir(path.dirname(out), { recursive: true });
  await fs.writeFile(out, JSON.stringify(report, null, 2), "utf8");
  console.log("");
  console.log(chalk.gray(`💾 Report saved: ${out}`));

  // Exit codes for CI
  if (errors > 0) process.exit(1);
  process.exit(0);
}

// Run the CLI. Any rejection from main() is reported as a single FAIL line on
// stderr and ends the process with exit code 2.
main().then(undefined, (failure) => {
  const reason = failure?.message || String(failure);
  console.error(chalk.bgRed.white(" FAIL "), chalk.red(reason));
  process.exit(2);
});

Запуск

# URL
node sd-validate.mjs "https://example.com/product/sku-1"

# только ошибки (удобно для CI)
node sd-validate.mjs --onlyErrors "https://example.com/product/sku-1"

# фильтр по microdata (если нужно сузить шум)
node sd-validate.mjs --microdataOnly "https://example.com/product/sku-1"

# сохранить отчёт в конкретный файл
node sd-validate.mjs --out "./reports/product.json" "https://example.com/product/sku-1"

Что считать «реальной проблемой»

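  • ERROR — можно валить пайплайн: скрипт завершается с кодом 1 (код 0 — ошибок нет; код 2 — сбой самого скрипта: сеть, чтение файла и т.п.)
  • WARNING — по большей части диагностический шум, на код выхода не влияет (но часть предупреждений реально полезна: например, missing image/price для Product)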