2025-09-05 14:59:21 +08:00

242 lines
7.2 KiB
JavaScript

import { createDefu } from "defu";
import { withoutLeadingSlash } from "ufo";
/**
 * Parses the text of a robots.txt file.
 *
 * Each line is cut at the first `#`, trimmed, and split at its first `:` into
 * a lower-cased directive name and a trimmed value. Lines without a `:` are
 * ignored. A `User-agent` line that follows at least one `Allow`/`Disallow`
 * line closes the current group and opens a new one; consecutive
 * `User-agent` lines share a group.
 *
 * @param {string} s - Raw robots.txt content.
 * @returns {{ groups: object[], sitemaps: string[], errors: string[] }}
 *   `groups` always ends with the group that was open when the input ended
 *   (even if it is empty). Error messages are prefixed with `L<n>` where `n`
 *   is the zero-based index of the offending line.
 */
export function parseRobotsTxt(s) {
  const USER_AGENT_KEYS = ["user-agent", "useragent", "user agent"];
  // Common misspellings are accepted as `disallow`.
  const DISALLOW_KEYS = ["disallow", "dissallow", "dissalow", "disalow", "diasllow", "disallaw"];
  const SITEMAP_KEYS = ["sitemap", "site-map"];

  // comments are too hard to parse in a logical order, we'll just omit them
  const emptyGroup = () => ({
    comment: [],
    disallow: [],
    allow: [],
    userAgent: []
  });

  const groups = [];
  const sitemaps = [];
  const errors = [];
  let group = emptyGroup();
  // Set once the open group has received an allow/disallow rule; the next
  // user-agent line then starts a fresh group.
  let groupHasRules = false;

  s.split("\n").forEach((rawLine, ln) => {
    const line = rawLine.split("#")[0].trim();
    const colonAt = line.indexOf(":");
    if (colonAt === -1) {
      return;
    }
    const rule = line.slice(0, colonAt).trim().toLowerCase();
    const val = line.slice(colonAt + 1).trim();

    if (USER_AGENT_KEYS.includes(rule)) {
      if (groupHasRules) {
        groups.push(group);
        group = emptyGroup();
        groupHasRules = false;
      }
      group.userAgent.push(val);
    } else if (rule === "allow") {
      group.allow.push(val);
      groupHasRules = true;
    } else if (DISALLOW_KEYS.includes(rule)) {
      group.disallow.push(val);
      groupHasRules = true;
    } else if (SITEMAP_KEYS.includes(rule)) {
      sitemaps.push(val);
    } else if (rule === "host") {
      group.host = val;
    } else if (rule === "clean-param") {
      const targetsYandex = group.userAgent.some((agent) => agent.toLowerCase().includes("yandex"));
      if (targetsYandex) {
        group.cleanParam = group.cleanParam || [];
        group.cleanParam.push(val);
      } else {
        errors.push(`L${ln}: Clean-param directive is only when targeting Yandex user agent.`);
      }
    } else {
      errors.push(`L${ln}: Unknown directive ${rule} `);
    }
  });

  groups.push(group);
  return { groups, sitemaps, errors };
}
/**
 * Checks every `allow` and `disallow` rule of a group and records an error
 * for each non-empty rule that starts with neither `/` nor `*`.
 *
 * The group itself is not modified; only `errors` is appended to.
 *
 * @param {object} group - Group whose `allow` / `disallow` arrays are checked
 *   (a missing array is treated as empty).
 * @param {string[]} errors - Collector that receives one message per bad rule.
 * @returns {void}
 */
function validateGroupRules(group, errors) {
  for (const directive of ["allow", "disallow"]) {
    // Name the directive the rule actually came from in the message.
    const label = directive === "allow" ? "Allow" : "Disallow";
    for (const rule of group[directive] || []) {
      // An empty rule is accepted as-is.
      if (rule === "") {
        continue;
      }
      if (!rule.startsWith("/") && !rule.startsWith("*")) {
        errors.push(`${label} rule "${rule}" must start with a \`/\` or be a \`*\`.`);
      }
    }
  }
}
/**
 * Tests whether `path` is matched by a robots.txt style `pattern`.
 *
 * `*` stands for any run of characters (including none), a `$` in the final
 * position requires the match to end exactly at the end of `path`, and every
 * other character must match literally. Without a trailing `$` the pattern
 * only has to match a prefix of `path`.
 *
 * @param {string} pattern - Rule pattern.
 * @param {string} path - Path to test.
 * @returns {boolean} `true` when the pattern matches.
 */
function matches(pattern, path) {
  const pathLen = path.length;
  const lastPatternIndex = pattern.length - 1;
  // The first `activeCount` entries are the path offsets (in ascending order)
  // at which the pattern prefix consumed so far can end.
  const prefixEnds = new Array(pathLen + 1).fill(0);
  let activeCount = 1;

  for (let idx = 0; idx <= lastPatternIndex; idx++) {
    const token = pattern[idx];

    if (token === "$" && idx === lastPatternIndex) {
      // End anchor: the furthest reachable offset must be the end of the path.
      return prefixEnds[activeCount - 1] === pathLen;
    }

    if (token === "*") {
      // A wildcard can stop anywhere from the smallest current offset onwards.
      activeCount = pathLen - prefixEnds[0] + 1;
      for (let k = 1; k < activeCount; k++) {
        prefixEnds[k] = prefixEnds[k - 1] + 1;
      }
      continue;
    }

    // Literal character: keep only the offsets that can advance over it.
    let kept = 0;
    for (let k = 0; k < activeCount; k++) {
      const end = prefixEnds[k];
      if (end < pathLen && path[end] === token) {
        prefixEnds[kept++] = end + 1;
      }
    }
    if (kept === 0) {
      return false;
    }
    activeCount = kept;
  }

  return true;
}
/**
 * Picks the rule that applies to `path`.
 *
 * Among the rules whose pattern matches, the one with the longest pattern
 * wins. When two matching patterns have the same length, an `allow` rule
 * replaces a non-`allow` one; otherwise the earlier rule is kept.
 *
 * @param {string} path - Path to look up.
 * @param {Array<{ pattern: string, allow: boolean } | null | undefined>} _rules
 *   Candidate rules; falsy entries are skipped.
 * @returns {{ pattern: string, allow: boolean } | null} The winning rule, or
 *   `null` when no rule matches.
 */
export function matchPathToRule(path, _rules) {
  let winner = null;
  for (const candidate of _rules.filter(Boolean)) {
    if (!matches(candidate.pattern, path)) {
      continue;
    }
    const isMoreSpecific = !winner || candidate.pattern.length > winner.pattern.length;
    const winsTieAsAllow = !isMoreSpecific
      && candidate.pattern.length === winner.pattern.length
      && candidate.allow
      && !winner.allow;
    if (isMoreSpecific || winsTieAsAllow) {
      winner = candidate;
    }
  }
  return winner;
}
/**
 * Validates a parsed robots.txt object in place.
 *
 * Groups with neither allow nor disallow rules are removed and reported in
 * `robotsTxt.errors`; the rules of every remaining group are checked with
 * `validateGroupRules`, which appends to the same error list.
 *
 * @param {{ groups: object[], errors: string[] }} robotsTxt - Parsed robots
 *   data; its `groups` property is reassigned and `errors` is appended to.
 * @returns {object} The same `robotsTxt` object.
 */
export function validateRobots(robotsTxt) {
  const keepGroup = (group) => {
    const hasRules = Boolean(group.allow.length || group.disallow.length);
    if (hasRules) {
      validateGroupRules(group, robotsTxt.errors);
      return true;
    }
    robotsTxt.errors.push(`Group "${group.userAgent.join(", ")}" has no allow or disallow rules. You must provide one of either.`);
    return false;
  };
  robotsTxt.groups = robotsTxt.groups.filter((group) => keepGroup(group));
  return robotsTxt;
}
/**
 * Coerces a value to an array.
 *
 * @param {*} v - Any value.
 * @returns {Array} An empty array for `undefined`, `v` itself when it is
 *   already an array, otherwise a one-element array containing `v`
 *   (so `null` becomes `[null]`).
 */
export function asArray(v) {
  if (typeof v === "undefined") {
    return [];
  }
  if (Array.isArray(v)) {
    return v;
  }
  return [v];
}
/**
 * Normalises a robots group so that `userAgent`, `allow` and `disallow` are
 * always arrays, and derives the helper fields `_indexable` and `_rules`.
 *
 * @param {object} group - Group input; `userAgent`, `allow` and `disallow`
 *   may each be a single value, an array, or missing.
 * @returns {object} A new object with all properties of `group` plus:
 *   - `userAgent`: array form of the input, or `["*"]` when it is falsy;
 *   - `disallow`: array form of the input (empty entries are kept);
 *   - `allow`: array form of the input with falsy entries removed;
 *   - `_indexable`: `false` when `disallow` contains the rule `"/"`;
 *   - `_rules`: `{ pattern, allow }` entries, non-empty disallow rules first,
 *     then allow rules.
 */
export function normalizeGroup(group) {
  const disallow = asArray(group.disallow);
  const allow = asArray(group.allow).filter(Boolean);
  return {
    ...group,
    userAgent: group.userAgent ? asArray(group.userAgent) : ["*"],
    disallow,
    allow,
    // `includes` compares values, so it must be given the rule itself; a
    // predicate callback would never be found and the flag would stay `true`.
    _indexable: !disallow.includes("/"),
    _rules: [
      ...disallow.filter(Boolean).map((pattern) => ({ pattern, allow: false })),
      ...allow.map((pattern) => ({ pattern, allow: true }))
    ]
  };
}
/**
 * Serialises rule groups and sitemap URLs to robots.txt text.
 *
 * For every group the output contains, in this order: its comments, its
 * `User-agent` lines (a single `*` when `userAgent` is missing), `Allow`,
 * `Disallow` and `Clean-param` lines, followed by one blank line. All
 * `Sitemap` lines come after the last group.
 *
 * @param {{ groups: object[], sitemaps: string[] }} input - Data to write.
 * @returns {string} Lines joined with `\n` (no trailing newline is added).
 */
export function generateRobotsTxt({ groups, sitemaps }) {
  const output = [];
  // Appends one formatted line per value.
  const emit = (values, format) => {
    for (const value of values) {
      output.push(format(value));
    }
  };

  for (const group of groups) {
    emit(group.comment || [], (comment) => `# ${comment}`);
    emit(group.userAgent || ["*"], (userAgent) => `User-agent: ${userAgent}`);
    emit(group.allow || [], (allow) => `Allow: ${allow}`);
    emit(group.disallow || [], (disallow) => `Disallow: ${disallow}`);
    emit(group.cleanParam || [], (cleanParam) => `Clean-param: ${cleanParam}`);
    // Blank separator after each group.
    output.push("");
  }
  emit(sitemaps, (sitemap) => `Sitemap: ${sitemap}`);

  return output.join("\n");
}
/**
 * defu instance with a custom property merger: when both the value already
 * on the target and the incoming value are arrays, they are replaced by
 * their de-duplicated union (existing entries first).
 *
 * The callback returns the resulting target value.
 * NOTE(review): defu presumably treats a truthy return as "handled" and
 * skips its default merge for that key — confirm against the defu docs.
 */
const merger = createDefu((target, prop, incoming) => {
  const existing = target[prop];
  if (Array.isArray(existing) && Array.isArray(incoming)) {
    target[prop] = [.../* @__PURE__ */ new Set([...existing, ...incoming])];
  }
  return target[prop];
});
/**
 * Collapses items that share the same value of `item[key]` into one object
 * each, combining them with `merger`.
 *
 * @param {object[]} arr - Items to combine.
 * @param {string} key - Property whose value identifies items to merge.
 * @returns {object[]} One merged object per distinct key value, in the
 *   order produced by `Object.values` on the intermediate lookup object.
 */
export function mergeOnKey(arr, key) {
  const mergedById = arr.reduce((lookup, item) => {
    const id = item[key];
    const previous = lookup[id] || {};
    lookup[id] = merger(item, previous);
    return lookup;
  }, {});
  return Object.values(mergedById);
}
/**
 * Reports whether a route path is treated as internal.
 *
 * After the leading slash is removed, a path counts as internal when it
 * starts with `.`, `_`, `@`, `cgi-bin`, `cdn-cgi` or `api`, or when its last
 * `/`-separated segment contains a `.` (if that segment is empty, the whole
 * path is checked instead).
 *
 * @param {string} _path - Route path.
 * @returns {boolean} `true` for internal routes.
 */
export function isInternalRoute(_path) {
  const path = withoutLeadingSlash(_path);
  const internalPrefixes = [".", "_", "cgi-bin", "cdn-cgi", "api", "@"];
  if (internalPrefixes.some((prefix) => path.startsWith(prefix))) {
    return true;
  }
  const segments = path.split("/");
  const lastSegment = segments[segments.length - 1] || path;
  return lastSegment.includes(".");
}
/**
 * Reduces a route config to an `{ allow, rule }` pair.
 *
 * `allow` is taken from the first of: a boolean `config.robots`,
 * `config.robots.indexable`, `config.index`. `rule` is taken from
 * `config.robots.rule` or from a string `config.robots`. When a rule is
 * present and `allow` is not truthy, `allow` is derived from the rule: it is
 * `false` for `"none"` or any rule containing `"noindex"`, otherwise `true`.
 *
 * @param {{ robots?: boolean | string | object | null, index?: boolean }} config
 *   Route configuration.
 * @returns {{ allow: boolean | undefined, rule: string | undefined } | undefined}
 *   `undefined` when the config carries neither an allow flag nor a rule.
 */
export function normaliseRobotsRouteRule(config) {
  const { robots } = config;
  // `typeof null` is "object", so exclude it before reading properties.
  const robotsObject = typeof robots === "object" && robots !== null ? robots : undefined;

  let allow;
  if (typeof robots === "boolean")
    allow = robots;
  else if (robotsObject && typeof robotsObject.indexable !== "undefined")
    allow = robotsObject.indexable;
  else if (typeof config.index !== "undefined")
    allow = config.index;

  let rule;
  if (robotsObject && typeof robotsObject.rule !== "undefined")
    rule = robotsObject.rule;
  else if (typeof robots === "string")
    rule = robots;

  // Note: this also re-derives `allow` when it was explicitly `false`.
  if (rule && !allow)
    allow = rule !== "none" && !rule.includes("noindex");

  if (typeof allow === "undefined" && typeof rule === "undefined")
    return;
  return {
    allow,
    rule
  };
}