
Split up AI filtering files (#592)

* Split up AI filtering files

Creates aggressive/moderate/permissive policies so administrators can choose their AI/LLM stance.

The aggressive policy matches the existing default in Anubis.

Removes the `Google-Extended` flag from `ai-robots-txt.yaml`, as it never appears in request user agents.

Renames `ai-robots-txt.yaml` to `ai-catchall.yaml`, as the file is no longer a copy of the upstream repo's file.

* chore: spelling

* chore: fix embeds

* chore: fix data includes

* chore: fix file name typo

* chore: Ignore READMEs in configs

* chore(lib/policy/config): go tool goimports -w

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
Corry Haines, 3 weeks ago
commit de7dbfe6d6

+ 25 - 0
.github/actions/spelling/expect.txt

@@ -18,7 +18,9 @@ blueskybot
 boi
 botnet
 BPort
+Brightbot
 broked
+Bytespider
 cachebuster
 Caddyfile
 caninetools
@@ -41,6 +43,7 @@ cloudflare
 confd
 containerbuild
 coreutils
+Cotoyogi
 CRDs
 crt
 daemonizing
@@ -49,6 +52,7 @@ Debian
 debrpm
 decaymap
 decompiling
+Diffbot
 discordapp
 discordbot
 distros
@@ -66,11 +70,15 @@ everyones
 evilbot
 evilsite
 expressionorlist
+externalagent
+externalfetcher
 extldflags
 facebookgo
+Factset
 fastcgi
 fediverse
 finfos
+Firecrawl
 flagenv
 Fordola
 forgejo
@@ -86,6 +94,7 @@ googlebot
 govulncheck
 GPG
 GPT
+gptbot
 grw
 Hashcash
 hashrate
@@ -97,8 +106,11 @@ hostable
 htmx
 httpdebug
 hypertext
+iaskspider
 iat
 ifm
+Imagesift
+imgproxy
 inp
 iss
 isset
@@ -146,11 +158,15 @@ nginx
 nobots
 NONINFRINGEMENT
 nosleep
+OCOB
 ogtags
+omgili
+omgilibot
 onionservice
 openai
 openrc
 pag
+Pangu
 parseable
 passthrough
 Patreon
@@ -185,18 +201,22 @@ RUnlock
 sas
 sasl
 Scumm
+searchbot
 searx
 sebest
 secretplans
 selfsigned
+Semrush
 setsebool
 shellcheck
+Sidetrade
 sitemap
 sls
 sni
 Sourceware
 Spambot
 sparkline
+spyderbot
 srv
 stackoverflow
 startprecmd
@@ -212,12 +232,15 @@ techarohq
 templ
 templruntime
 testarea
+Tik
+Timpibot
 torproject
 traefik
 unixhttpd
 unmarshal
 uvx
 Varis
+Velen
 vendored
 vhosts
 videotest
@@ -227,9 +250,11 @@ webmaster
 webpage
 websecure
 websites
+Webzio
 wordpress
 Workaround
 workdir
+wpbot
 xcaddy
 Xeact
 xeiaso

+ 1 - 1
data/botPolicies.json

@@ -4,7 +4,7 @@
       "import": "(data)/bots/_deny-pathological.yaml"
       "import": "(data)/bots/_deny-pathological.yaml"
     },
     },
     {
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/meta/ai-block-aggressive.yaml"
     },
     },
     {
     {
       "import": "(data)/crawlers/_allow-good.yaml"
       "import": "(data)/crawlers/_allow-good.yaml"

+ 6 - 2
data/botPolicies.yaml

@@ -17,8 +17,12 @@ bots:
     import: (data)/bots/_deny-pathological.yaml
   - import: (data)/bots/aggressive-brazilian-scrapers.yaml

-  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
-  - import: (data)/bots/ai-robots-txt.yaml
+  # Aggressively block AI/LLM related bots/agents by default
+  - import: (data)/meta/ai-block-aggressive.yaml
+
+  # Consider replacing the aggressive AI policy with more selective policies:
+  # - import: (data)/meta/ai-block-moderate.yaml
+  # - import: (data)/meta/ai-block-permissive.yaml

   # Search engine crawlers to allow, defaults to:
   #   - Google (so they don't try to bypass Anubis)
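For example, an administrator who prefers the moderate stance would swap the import in `data/botPolicies.yaml`, following the commented-out alternatives above (a minimal sketch; the surrounding imports are elided):

```yaml
bots:
  # ... other imports ...

  # Replace the default aggressive AI/LLM stance with the moderate one
  # - import: (data)/meta/ai-block-aggressive.yaml
  - import: (data)/meta/ai-block-moderate.yaml
```

The permissive stance is selected the same way via `(data)/meta/ai-block-permissive.yaml`.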

+ 11 - 0
data/bots/ai-catchall.yaml

@@ -0,0 +1,11 @@
+# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt
+# Add new/undocumented agents here. Where documentation exists, consider moving to dedicated policy files.
+# Notes on various agents:
+#  - Amazonbot: Well documented, but they refuse to state which agent collects training data.
+#  - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated or hallucinations?
+#  - Perplexity*: Well documented, but they refuse to state which agent collects training data.
+# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+- name: "ai-catchall"
+  user_agent_regex: >-
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+  action: DENY

+ 0 - 6
data/bots/ai-robots-txt.yaml

@@ -1,6 +0,0 @@
-# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
-# Note: Blocks human-directed/non-training user agents
-- name: "ai-robots-txt"
-  user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
-  action: DENY

+ 8 - 0
data/clients/ai.yaml

@@ -0,0 +1,8 @@
+# User agents that act on behalf of humans in AI tools, e.g. searching the web.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+#  - Claude-User: No published IP allowlist
+- name: "ai-clients"
+  user_agent_regex: >-
+    ChatGPT-User|Claude-User|MistralAI-User
+  action: DENY
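The comment above expects a matching ALLOW policy per client; the meta policies below reference files such as `(data)/clients/openai-chatgpt-user.yaml`, which are not part of this diff. A hypothetical sketch of what such a counterpart could look like, assuming the policy schema accepts a `remote_addresses` CIDR allowlist (the CIDR shown is a documentation-only placeholder, not a real published range):

```yaml
# Hypothetical ALLOW counterpart for ChatGPT-User; illustrative only,
# not the actual file referenced by the meta policies.
- name: "openai-chatgpt-user"
  user_agent_regex: >-
    ChatGPT-User
  # Assumption: pin the agent to its vendor's published egress ranges.
  # 203.0.113.0/24 is an RFC 5737 placeholder CIDR.
  remote_addresses:
    - 203.0.113.0/24
  action: ALLOW
```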

+ 8 - 0
data/crawlers/ai-search.yaml

@@ -0,0 +1,8 @@
+# User agents that index exclusively for search in AI systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+#  - Claude-SearchBot: No published IP allowlist
+- name: "ai-crawlers-search"
+  user_agent_regex: >-
+    OAI-SearchBot|Claude-SearchBot
+  action: DENY

+ 8 - 0
data/crawlers/ai-training.yaml

@@ -0,0 +1,8 @@
+# User agents that crawl for training AI/LLM systems
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+#  - ClaudeBot: No published IP allowlist
+- name: "ai-crawlers-training"
+  user_agent_regex: >-
+    GPTBot|ClaudeBot
+  action: DENY

+ 1 - 1
data/embed.go

@@ -3,6 +3,6 @@ package data
 import "embed"
 import "embed"
 
 
 var (
 var (
-	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers
+	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta
 	BotPolicies embed.FS
 	BotPolicies embed.FS
 )
 )

+ 5 - 0
data/meta/README.md

@@ -0,0 +1,5 @@
+# meta policies
+
+Contains policies that exclusively reference policies in _multiple_ other data folders.
+
+Akin to "stances" that the administrator can take, with reference to various topics, such as AI/LLM systems.

+ 6 - 0
data/meta/ai-block-aggressive.yaml

@@ -0,0 +1,6 @@
+# Blocks all AI/LLM associated user agents, regardless of purpose or human agency
+# Warning: To completely block some AI/LLM training, such as with Google, you _must_ place flags in robots.txt.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/clients/ai.yaml
+- import: (data)/crawlers/ai-search.yaml
+- import: (data)/crawlers/ai-training.yaml

+ 7 - 0
data/meta/ai-block-moderate.yaml

@@ -0,0 +1,7 @@
+# Blocks all AI/LLM bots used for training or unknown/undocumented purposes.
+# Permits user agents with explicitly documented non-training use, and published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/ai-training.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml

+ 6 - 0
data/meta/ai-block-permissive.yaml

@@ -0,0 +1,6 @@
+# Permits all well documented AI/LLM user agents with published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/crawlers/openai-gptbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml

+ 2 - 0
docs/docs/CHANGELOG.md

@@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `--version` flag.
 - Added `anubis_proxied_requests_total` metric to count proxied requests.
 - Add `Applebot` as "good" web crawler
+- Reorganize AI/LLM crawler blocking into three separate stances, maintaining existing status quo as default.
+- Split out AI/LLM user agent blocking policies, adding documentation for each.

 ## v1.18.0: Varis zos Galvus


+ 6 - 6
docs/docs/admin/configuration/import.mdx

@@ -14,7 +14,7 @@ EG:
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml"
+      "import": "(data)/bots/ai-catchall.yaml"
     },
     {
       "import": "(data)/bots/cloudflare-workers.yaml"
@@ -29,8 +29,8 @@ EG:
 ```yaml
 bots:
   # Pathological bots to deny
-  - # This correlates to data/bots/ai-robots-txt.yaml in the source tree
-    import: (data)/bots/ai-robots-txt.yaml
+  - # This correlates to data/bots/ai-catchall.yaml in the source tree
+    import: (data)/bots/ai-catchall.yaml
   - import: (data)/bots/cloudflare-workers.yaml
 ```

@@ -46,7 +46,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"
@@ -60,7 +60,7 @@ Of note, a bot rule can either have inline bot configuration or import a bot con

 ```yaml
 bots:
-  - import: (data)/bots/ai-robots-txt.yaml
+  - import: (data)/bots/ai-catchall.yaml
     name: generic-browser
     user_agent_regex: >
       Mozilla|Opera
@@ -167,7 +167,7 @@ static
 ├── botPolicies.json
 ├── botPolicies.yaml
 ├── bots
-│   ├── ai-robots-txt.yaml
+│   ├── ai-catchall.yaml
 │   ├── cloudflare-workers.yaml
 │   ├── headless-browsers.yaml
 │   └── us-ai-scraper.yaml

+ 4 - 0
lib/policy/config/config_test.go

@@ -251,6 +251,7 @@ func TestImportStatement(t *testing.T) {
 		"bots",
 		"bots",
 		"common",
 		"common",
 		"crawlers",
 		"crawlers",
+		"meta",
 	} {
 	} {
 		if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error {
 		if err := fs.WalkDir(data.BotPolicies, folderName, func(path string, d fs.DirEntry, err error) error {
 			if err != nil {
 			if err != nil {
@@ -259,6 +260,9 @@ func TestImportStatement(t *testing.T) {
 			if d.IsDir() {
 				return nil
 			}
+			if d.Name() == "README.md" {
+				return nil
+			}

 			tests = append(tests, testCase{
 				name:       "(data)/" + path,

+ 1 - 1
lib/policy/config/testdata/bad/import_and_bot.json

@@ -1,7 +1,7 @@
 {
   "bots": [
     {
-      "import": "(data)/bots/ai-robots-txt.yaml",
+      "import": "(data)/bots/ai-catchall.yaml",
       "name": "generic-browser",
       "user_agent_regex": "Mozilla|Opera\n",
       "action": "CHALLENGE"

+ 1 - 1
lib/policy/config/testdata/bad/import_and_bot.yaml

@@ -1,5 +1,5 @@
 bots:
-- import: (data)/bots/ai-robots-txt.yaml
+- import: (data)/bots/ai-catchall.yaml
   name: generic-browser
   user_agent_regex: >
     Mozilla|Opera

+ 1 - 0
yeetfile.js

@@ -35,6 +35,7 @@ $`npm run assets`;
             $`cp -a data/clients ${doc}/data/clients`;
             $`cp -a data/common ${doc}/data/common`;
             $`cp -a data/crawlers ${doc}/data/crawlers`;
+            $`cp -a data/meta ${doc}/data/meta`;
         },
     }));
 });
 });