CLI валидатор микроразметки: JSON-LD + Microdata + RDFa (schema.org)
Node.js CLI проверяет structured data на странице (schema.org), печатает ошибки/предупреждения в консоль и сохраняет отчёт в JSON. Удобно для CI и ручной диагностики.
Как использовать
- Установи зависимости через npm (один раз).
- Запусти: node sd-validate.mjs <url|file.html>.
- Смотри вывод в консоли и отчёт JSON на диске (sd-report-*.json).
Этот CLI — не HTML-валидатор (W3C Nu), а проверка structured data: JSON-LD + Microdata + RDFa по модели schema.org. Нужен, когда Google/Schema ругаются на Product/Offer (price, image, ratingCount и т.д.), а HTML «валиден».
Установка
mkdir -p tools/sd-validator && cd tools/sd-validator
npm init -y
npm i @adobe/structured-data-validator @marbec/web-auto-extractor chalk yargs
Скрипт sd-validate.mjs
#!/usr/bin/env node
import fs from "node:fs/promises";
import path from "node:path";
import { createHash } from "node:crypto";
import process from "node:process";
import chalk from "chalk";
import yargs from "yargs/yargs";
import { hideBin } from "yargs/helpers";
import * as SDV from "@adobe/structured-data-validator";
import WebAutoExtractor from "@marbec/web-auto-extractor";
// Resolve the Validator class regardless of how the package is built.
// A namespace import is used on purpose: a static *default* import fails at
// module link time when the package exposes only named exports, so a fallback
// chain placed after it would never get a chance to run.
//   - ESM named export:               SDV.Validator
//   - CJS exports object as default:  SDV.default.Validator
//   - class exported as the default:  SDV.default
const Validator = SDV.Validator ?? SDV.default?.Validator ?? SDV.default;
// `new Validator(...)` is called later, so anything that is not a function
// (e.g. a plain exports object) is rejected up front with a clear message.
if (typeof Validator !== "function") {
  throw new Error("Cannot resolve Validator export from @adobe/structured-data-validator");
}
// Declarative table of the CLI flags; handed to yargs in one call below.
const cliOptions = {
  out: {
    describe: "Путь для отчёта (json). По умолчанию: ./sd-report-<hash>.json",
    type: "string",
    default: "",
  },
  onlyErrors: {
    describe: "Показывать только ERROR (WARNING скрыть)",
    type: "boolean",
    default: false,
  },
  microdataOnly: {
    describe: "Фильтр сообщений по microdata/itemprop/itemscope/itemtype",
    type: "boolean",
    default: false,
  },
  timeout: {
    describe: "Таймаут загрузки URL (мс)",
    type: "number",
    default: 20000,
  },
  userAgent: {
    describe: "User-Agent для скачивания HTML по URL",
    type: "string",
    default:
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) sd-validate/1.0",
  },
};

// Parse the command line synchronously; exactly one positional argument
// (a URL or a path to an HTML file) is required.
const argv = yargs(hideBin(process.argv))
  .scriptName("sd-validate")
  .usage("$0 [options] <url|file.html>")
  .options(cliOptions)
  .demandCommand(1)
  .help()
  .parseSync();

// The first positional argument is the page to validate.
const [firstPositional] = argv._;
const input = String(firstPositional);
/**
 * Tells whether the given input starts with an http:// or https:// scheme
 * (case-insensitive).
 *
 * @param {string} s - Raw CLI input.
 * @returns {boolean} true when the input begins with an HTTP(S) scheme.
 */
function isUrl(s) {
  const httpScheme = /^https?:\/\//i;
  return httpScheme.exec(s) !== null;
}
/**
 * Produces a short fingerprint: the first 10 hex characters of the SHA-1
 * digest of the input.
 *
 * @param {string} s - Data to hash.
 * @returns {string} 10-character lowercase hex prefix of the digest.
 */
function sha1(s) {
  const hasher = createHash("sha1");
  hasher.update(s);
  const fullHex = hasher.digest("hex");
  return fullHex.substring(0, 10);
}
/**
 * Downloads a URL as text, aborting the request once `timeout` milliseconds
 * have elapsed. The timer stays armed while the body is being read, so the
 * limit applies to the whole download, not just to the response headers.
 *
 * @param {string} url - Address to fetch (redirects are followed).
 * @param {{ timeout: number, headers: object }} options - Time limit in
 *   milliseconds and the request headers to send.
 * @returns {Promise<{ ok: boolean, status: number, text: string }>} Response
 *   status flags together with the body text.
 */
async function fetchWithTimeout(url, { timeout, headers }) {
  const controller = new AbortController();
  const timer = setTimeout(() => {
    controller.abort();
  }, timeout);

  try {
    const response = await fetch(url, {
      headers,
      signal: controller.signal,
      redirect: "follow",
    });
    const body = await response.text();
    const { ok, status } = response;
    return { ok, status, text: body };
  } finally {
    // Always disarm the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}
/**
 * Renders one validator issue as a single console line: a coloured severity
 * badge, the message, and an optional dimmed suffix with details.
 *
 * @param {object} i - Issue object; the fields read are `severity`,
 *   `issueMessage`, `fieldNames`, `location` and `path`.
 * @returns {string} Formatted line (with chalk colour codes).
 */
function formatIssue(i) {
  // Anything that is not exactly "ERROR" is displayed as a warning.
  const isError = i.severity === "ERROR";
  const badge = isError
    ? chalk.bgRed.white(" ERROR ")
    : chalk.bgYellow.black(" WARN ");
  const text = i.issueMessage ?? "Unknown issue";

  // Collect only the details that are actually present.
  const details = [];
  if (Array.isArray(i.fieldNames) && i.fieldNames.length) {
    details.push(`fields: ${i.fieldNames.join(", ")}`);
  }
  if (i.location) {
    details.push(`loc: ${i.location}`);
  }
  if (Array.isArray(i.path) && i.path.length) {
    details.push(`path: ${JSON.stringify(i.path)}`);
  }

  if (details.length === 0) {
    return `${badge} ${text}`;
  }
  return `${badge} ${text}${chalk.dim(` (${details.join(" | ")})`)}`;
}
/**
 * Prints a dimmed horizontal rule to stdout to separate output sections.
 */
function hr() {
  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  console.log(chalk.dim(rule));
}
/**
 * Runs the whole CLI pipeline for the parsed `input`:
 *   1. loads the HTML (over HTTP for URLs, from disk otherwise);
 *   2. counts raw microdata attributes with regular expressions;
 *   3. extracts structured data with WebAutoExtractor;
 *   4. downloads the schema.org model;
 *   5. validates, applies the --onlyErrors / --microdataOnly filters and
 *      prints the issues;
 *   6. writes the JSON report.
 *
 * Terminates the process itself: exit code 1 when at least one ERROR remains
 * after filtering, 0 otherwise. Failures are thrown to the caller.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the page body is empty or too short, or when the
 *   schema.org model cannot be downloaded or parsed.
 */
async function main() {
  console.log(chalk.cyan("🔍 Structured Data validation"));
  hr();
  console.log(`Input: ${chalk.bold(input)}`);

  // 1) Load HTML
  let html = "";
  const sourceMeta = { type: isUrl(input) ? "url" : "file", input };
  if (isUrl(input)) {
    const { ok, status, text } = await fetchWithTimeout(input, {
      timeout: argv.timeout,
      headers: { "user-agent": argv.userAgent },
    });
    html = text;
    sourceMeta.http = { status, ok };
    if (!html || html.trim().length < 50) {
      throw new Error(`Похоже, вернулся пустой/короткий HTML. HTTP=${status}`);
    }
    if (!ok) {
      // The body is still validated (best effort), but the user must know the
      // result may describe an error page rather than the intended document.
      console.warn(
        chalk.yellow(`⚠️ HTTP ${status}: the server returned a non-2xx response; validating its body anyway`)
      );
    }
  } else {
    html = await fs.readFile(input, "utf8");
    sourceMeta.file = { abs: path.resolve(input) };
  }

  // 2) Quick microdata scan (raw): plain regex counts, independent of the extractor
  const countMatches = (re) => (html.match(re) || []).length;
  const microdataCount = countMatches(/\bitemscope\b/gi);
  const itemtypeCount = countMatches(/\bitemtype\s*=\s*["'][^"']+["']/gi);
  const itempropCount = countMatches(/\bitemprop\s*=\s*["'][^"']+["']/gi);
  console.log("");
  console.log(chalk.green("📦 Found in HTML (raw scan):"));
  console.log(` • itemscope: ${microdataCount}`);
  console.log(` • itemtype: ${itemtypeCount}`);
  console.log(` • itemprop: ${itempropCount}`);

  // 3) Extract structured data (JSON-LD + microdata + RDFa)
  const extractor = new WebAutoExtractor({
    addLocation: true,
    embedSource: ["rdfa", "microdata"],
  });
  const extracted = extractor.parse(html);

  // 4) Load schema.org model
  console.log("");
  console.log(chalk.cyan("⬇️ Loading schema.org model..."));
  const schemaUrl = "https://schema.org/version/latest/schemaorg-all-https.jsonld";
  // Reuse the timeout-aware helper so a stalled download cannot hang the run;
  // the same --timeout value bounds this request as well.
  const schemaResponse = await fetchWithTimeout(schemaUrl, {
    timeout: argv.timeout,
    headers: {},
  });
  if (!schemaResponse.ok) {
    throw new Error(`Cannot load schema.org model: HTTP ${schemaResponse.status} (${schemaUrl})`);
  }
  let schemaOrgJson;
  try {
    schemaOrgJson = JSON.parse(schemaResponse.text);
  } catch (err) {
    throw new Error(`Cannot parse schema.org model as JSON (${schemaUrl})`, { cause: err });
  }

  // 5) Validate
  console.log(chalk.cyan("🧪 Validating..."));
  const validator = new Validator(schemaOrgJson);
  const issuesAll = await validator.validate(extracted);
  let issues = Array.isArray(issuesAll) ? issuesAll : [];
  if (argv.onlyErrors) {
    issues = issues.filter((x) => x.severity === "ERROR");
  }
  if (argv.microdataOnly) {
    // Heuristic filter: keep an issue when any of its serialized fields
    // mentions microdata or one of the microdata attribute names.
    const keywords = ["microdata", "itemprop", "itemscope", "itemtype"];
    issues = issues.filter((x) => {
      const serialized = JSON.stringify(x).toLowerCase();
      return keywords.some((k) => serialized.includes(k));
    });
  }

  // Counters describe the filtered list; everything that is not ERROR counts as a warning.
  const errors = issues.filter((x) => x.severity === "ERROR").length;
  const warns = issues.length - errors;
  console.log("");
  hr();
  console.log(`✅ Issues total: ${chalk.bold(String(issues.length))}`);
  console.log(`❌ Errors: ${errors ? chalk.red(errors) : chalk.green("0")}`);
  console.log(`⚠️ Warnings: ${warns ? chalk.yellow(warns) : chalk.green("0")}`);
  hr();
  if (issues.length) {
    console.log(chalk.cyan("📋 Details:"));
    for (const i of issues) {
      console.log(` ${formatIssue(i)}`);
    }
  } else {
    console.log(chalk.green("🎉 No issues found (for this validator)."));
  }

  // 6) Save report
  const out =
    argv.out && argv.out.trim()
      ? argv.out
      : path.resolve(process.cwd(), `sd-report-${sha1(input)}.json`);
  const report = {
    version: "1.0",
    ts: new Date().toISOString(),
    input: sourceMeta,
    stats: { issues: issues.length, errors, warnings: warns },
    htmlScan: { itemscope: microdataCount, itemtype: itemtypeCount, itemprop: itempropCount },
    issues,
  };
  // Create the target directory first so that e.g. `--out ./reports/x.json`
  // works even when ./reports does not exist yet.
  await fs.mkdir(path.dirname(out), { recursive: true });
  await fs.writeFile(out, JSON.stringify(report, null, 2), "utf8");
  console.log("");
  console.log(chalk.gray(`💾 Report saved: ${out}`));

  // Exit codes for CI: 1 when errors remain, 0 otherwise.
  process.exit(errors > 0 ? 1 : 0);
}
/**
 * Last-resort handler: prints the failure reason to stderr and terminates
 * the process with exit code 2.
 *
 * @param {unknown} e - Whatever main() rejected with.
 */
function reportFatal(e) {
  const reason = e?.message || String(e);
  console.error(chalk.bgRed.white(" FAIL "), chalk.red(reason));
  process.exit(2);
}

main().catch(reportFatal);
Запуск
# URL
node sd-validate.mjs "https://example.com/product/sku-1"
# только ошибки (удобно для CI)
node sd-validate.mjs --onlyErrors "https://example.com/product/sku-1"
# фильтр по microdata (если нужно сузить шум)
node sd-validate.mjs --microdataOnly "https://example.com/product/sku-1"
# сохранить отчёт в конкретный файл
node sd-validate.mjs --out "./reports/product.json" "https://example.com/product/sku-1"
Что считать “реальной проблемой”
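- ERROR — можно валить пайплайн (exit 1).
- WARNING — диагностический шум (часть реально полезна: missing image/price для Product).
- Код выхода 2 — сбой самого скрипта (сеть, чтение файла и т.п.), а не результат проверки разметки.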