
feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)

* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies

* feat: add documentation for robots2policy CLI tool

* feat: implement crawl delay handling as weight adjustment in Anubis rules

* feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

* test: add comprehensive tests for robots2policy conversion and parsing

* fix: update example URL in usage instructions for robots2policy CLI

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* fix(robots2policy): use sigs.k8s.io/yaml

Signed-off-by: Xe Iaso <me@xeiaso.net>

* feat(config): properly marshal bot policy rules

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(yeetfile): expose robots2policy in libexec

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(yeetfile): put robots2policy in $PATH

Signed-off-by: Xe Iaso <me@xeiaso.net>

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* style: reorder imports

* refactor: use preexisting structs in config

* fix: correct flag check in main function

* fix: reorder fields in AnubisRule struct for better alignment

* style: improve alignment of struct fields in AnubisRule and OGTagCache

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* fix: add validation for generated Anubis rules from robots.txt

* feat: add batch processing for robots.txt files to generate Anubis CEL policies

* fix: improve usage message and error handling for input file requirement

* refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling

* refactor: reorganize policy definitions in YAML files for consistency and clarity

* fix: correct indentation in blacklist and complex YAML files for consistency

* test: enhance output comparison in robots2policy tests for YAML and JSON formats

* Revert "fix: improve usage message and error handling for input file requirement"

This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8.

* fix: improve usage message and error handling in robots2policy

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>
Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Co-authored-by: Xe Iaso <me@xeiaso.net>
Jason Cameron 1 week ago
parent
commit
e0781e4560

+ 4 - 0
.github/actions/spelling/expect.txt

@@ -12,6 +12,7 @@ archlinux
 badregexes
 bdba
 berr
+betteralign
 bingbot
 bitcoin
 blogging
@@ -96,6 +97,7 @@ gomod
 goodbot
 googlebot
 govulncheck
+goyaml
 GPG
 GPT
 gptbot
@@ -162,6 +164,7 @@ mojeekbot
 mozilla
 nbf
 netsurf
+NFlag
 nginx
 nobots
 NONINFRINGEMENT
@@ -217,6 +220,7 @@ sebest
 secretplans
 selfsigned
 Semrush
+Seo
 setsebool
 shellcheck
 Sidetrade

+ 78 - 0
cmd/robots2policy/batch/batch_process.go

@@ -0,0 +1,78 @@
+/*
+Batch process robots.txt files from archives like https://github.com/nrjones8/robots-dot-txt-archive-bot/tree/master/data/cleaned
+into Anubis CEL policies. Usage: go run batch_process.go <directory with robots.txt files>
+*/
+package main
+
+import (
+	"fmt"
+	"io/fs"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+)
+
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Println("Usage: go run batch_process.go <cleaned_directory>")
+		fmt.Println("Example: go run batch_process.go ./cleaned")
+		os.Exit(1)
+	}
+
+	cleanedDir := os.Args[1]
+	outputDir := "generated_policies"
+
+	// Create output directory
+	if err := os.MkdirAll(outputDir, 0755); err != nil {
+		log.Fatalf("Failed to create output directory: %v", err)
+	}
+
+	count := 0
+	err := filepath.WalkDir(cleanedDir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Skip directories
+		if d.IsDir() {
+			return nil
+		}
+
+		// Generate policy name from file path
+		relPath, _ := filepath.Rel(cleanedDir, path)
+		policyName := strings.ReplaceAll(relPath, "/", "-")
+		policyName = strings.TrimSuffix(policyName, "-robots.txt")
+		policyName = strings.ReplaceAll(policyName, ".", "-")
+
+		outputFile := filepath.Join(outputDir, policyName+".yaml")
+
+		cmd := exec.Command("go", "run", "main.go",
+			"-input", path,
+			"-output", outputFile,
+			"-name", policyName,
+			"-format", "yaml")
+
+		if err := cmd.Run(); err != nil {
+			fmt.Printf("Warning: Failed to process %s: %v\n", path, err)
+			return nil // Continue processing other files
+		}
+
+		count++
+		if count%100 == 0 {
+			fmt.Printf("Processed %d files...\n", count)
+		} else if count%10 == 0 {
+			fmt.Print(".")
+		}
+
+		return nil
+	})
+
+	if err != nil {
+		log.Fatalf("Error walking directory: %v", err)
+	}
+
+	fmt.Printf("Successfully processed %d robots.txt files\n", count)
+	fmt.Printf("Generated policies saved to: %s/\n", outputDir)
+}
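
Note that the batch tool shells out to `go run main.go`, so it assumes the working directory is `cmd/robots2policy`. A sketch of one way to run it (the `./cleaned` layout comes from the archive repo linked in the file header; exact paths are assumptions):

```bash
cd cmd/robots2policy
go run ./batch ./cleaned
# Generated policies saved to: generated_policies/
```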

+ 313 - 0
cmd/robots2policy/main.go

@@ -0,0 +1,313 @@
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"regexp"
+	"strings"
+
+	"github.com/TecharoHQ/anubis/lib/policy/config"
+
+	"sigs.k8s.io/yaml"
+)
+
+var (
+	inputFile     = flag.String("input", "", "path to robots.txt file (use - for stdin)")
+	outputFile    = flag.String("output", "", "output file path (use - for stdout, defaults to stdout)")
+	outputFormat  = flag.String("format", "yaml", "output format: yaml or json")
+	baseAction    = flag.String("action", "CHALLENGE", "default action for disallowed paths: ALLOW, DENY, CHALLENGE, WEIGH")
+	crawlDelay    = flag.Int("crawl-delay-weight", 0, "if > 0, add weight adjustment for crawl-delay (difficulty adjustment)")
+	policyName    = flag.String("name", "robots-txt-policy", "name for the generated policy")
+	userAgentDeny = flag.String("deny-user-agents", "DENY", "action for specifically blocked user agents: DENY, CHALLENGE")
+	helpFlag      = flag.Bool("help", false, "show help")
+)
+
+type RobotsRule struct {
+	UserAgent   string
+	Disallows   []string
+	Allows      []string
+	CrawlDelay  int
+	IsBlacklist bool // true if this is a specifically denied user agent
+}
+
+type AnubisRule struct {
+	Expression *config.ExpressionOrList `yaml:"expression,omitempty" json:"expression,omitempty"`
+	Challenge  *config.ChallengeRules   `yaml:"challenge,omitempty" json:"challenge,omitempty"`
+	Weight     *config.Weight           `yaml:"weight,omitempty" json:"weight,omitempty"`
+	Name       string                   `yaml:"name" json:"name"`
+	Action     string                   `yaml:"action" json:"action"`
+}
+
+func init() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
+		fmt.Fprintf(os.Stderr, "%s [options] -input <robots.txt>\n\n", os.Args[0])
+		flag.PrintDefaults()
+		fmt.Fprintln(os.Stderr, "\nExamples:")
+		fmt.Fprintln(os.Stderr, "  # Convert local robots.txt file")
+		fmt.Fprintln(os.Stderr, "  robots2policy -input robots.txt -output policy.yaml")
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "  # Convert from URL")
+		fmt.Fprintln(os.Stderr, "  robots2policy -input https://example.com/robots.txt -format json")
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "  # Read from stdin, write to stdout")
+		fmt.Fprintln(os.Stderr, "  curl https://example.com/robots.txt | robots2policy -input -")
+		os.Exit(2)
+	}
+}
+
+func main() {
+	flag.Parse()
+
+	if len(flag.Args()) > 0 || *helpFlag || *inputFile == "" {
+		flag.Usage()
+	}
+
+	// Read robots.txt
+	var input io.Reader
+	if *inputFile == "-" {
+		input = os.Stdin
+	} else if strings.HasPrefix(*inputFile, "http://") || strings.HasPrefix(*inputFile, "https://") {
+		resp, err := http.Get(*inputFile)
+		if err != nil {
+			log.Fatalf("failed to fetch robots.txt from URL: %v", err)
+		}
+		defer resp.Body.Close()
+		input = resp.Body
+	} else {
+		file, err := os.Open(*inputFile)
+		if err != nil {
+			log.Fatalf("failed to open input file: %v", err)
+		}
+		defer file.Close()
+		input = file
+	}
+
+	// Parse robots.txt
+	rules, err := parseRobotsTxt(input)
+	if err != nil {
+		log.Fatalf("failed to parse robots.txt: %v", err)
+	}
+
+	// Convert to Anubis rules
+	anubisRules := convertToAnubisRules(rules)
+
+	// Check if any rules were generated
+	if len(anubisRules) == 0 {
+		log.Fatal("no valid rules generated from robots.txt - file may be empty or contain no disallow directives")
+	}
+
+	// Generate output
+	var output []byte
+	switch strings.ToLower(*outputFormat) {
+	case "yaml":
+		output, err = yaml.Marshal(anubisRules)
+	case "json":
+		output, err = json.MarshalIndent(anubisRules, "", "  ")
+	default:
+		log.Fatalf("unsupported output format: %s (use yaml or json)", *outputFormat)
+	}
+
+	if err != nil {
+		log.Fatalf("failed to marshal output: %v", err)
+	}
+
+	// Write output
+	if *outputFile == "" || *outputFile == "-" {
+		fmt.Print(string(output))
+	} else {
+		err = os.WriteFile(*outputFile, output, 0644)
+		if err != nil {
+			log.Fatalf("failed to write output file: %v", err)
+		}
+		fmt.Printf("Generated Anubis policy written to %s\n", *outputFile)
+	}
+}
+
+func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
+	scanner := bufio.NewScanner(input)
+	var rules []RobotsRule
+	var currentRule *RobotsRule
+
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+
+		// Skip empty lines and comments
+		if line == "" || strings.HasPrefix(line, "#") {
+			continue
+		}
+
+		// Split on first colon
+		parts := strings.SplitN(line, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+
+		directive := strings.TrimSpace(strings.ToLower(parts[0]))
+		value := strings.TrimSpace(parts[1])
+
+		switch directive {
+		case "user-agent":
+			// Start a new rule section
+			if currentRule != nil {
+				rules = append(rules, *currentRule)
+			}
+			currentRule = &RobotsRule{
+				UserAgent: value,
+				Disallows: make([]string, 0),
+				Allows:    make([]string, 0),
+			}
+
+		case "disallow":
+			if currentRule != nil && value != "" {
+				currentRule.Disallows = append(currentRule.Disallows, value)
+			}
+
+		case "allow":
+			if currentRule != nil && value != "" {
+				currentRule.Allows = append(currentRule.Allows, value)
+			}
+
+		case "crawl-delay":
+			if currentRule != nil {
+				if delay, err := parseIntSafe(value); err == nil {
+					currentRule.CrawlDelay = delay
+				}
+			}
+		}
+	}
+
+	// Don't forget the last rule
+	if currentRule != nil {
+		rules = append(rules, *currentRule)
+	}
+
+	// Mark blacklisted user agents (those with "Disallow: /")
+	for i := range rules {
+		for _, disallow := range rules[i].Disallows {
+			if disallow == "/" {
+				rules[i].IsBlacklist = true
+				break
+			}
+		}
+	}
+
+	return rules, scanner.Err()
+}
+
+func parseIntSafe(s string) (int, error) {
+	var result int
+	_, err := fmt.Sscanf(s, "%d", &result)
+	return result, err
+}
+
+func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
+	var anubisRules []AnubisRule
+	ruleCounter := 0
+
+	for _, robotsRule := range robotsRules {
+		userAgent := robotsRule.UserAgent
+
+		// Handle crawl delay as weight adjustment (do this first before any continues)
+		if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
+				Action: "WEIGH",
+				Weight: &config.Weight{Adjust: *crawlDelay},
+			}
+
+			if userAgent == "*" {
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{"true"}, // Always applies
+				}
+			} else {
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+				}
+			}
+
+			anubisRules = append(anubisRules, rule)
+		}
+
+		// Handle blacklisted user agents (complete deny/challenge)
+		if robotsRule.IsBlacklist {
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
+				Action: *userAgentDeny,
+			}
+
+			if userAgent == "*" {
+				// This would block everything - convert to a weight adjustment instead
+				rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
+				rule.Action = "WEIGH"
+				rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{"true"}, // Always applies
+				}
+			} else {
+				rule.Expression = &config.ExpressionOrList{
+					All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
+				}
+			}
+			anubisRules = append(anubisRules, rule)
+			continue
+		}
+
+		// Handle specific disallow rules
+		for _, disallow := range robotsRule.Disallows {
+			if disallow == "/" {
+				continue // Already handled as blacklist above
+			}
+
+			ruleCounter++
+			rule := AnubisRule{
+				Name:   fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
+				Action: *baseAction,
+			}
+
+			// Build CEL expression
+			var conditions []string
+
+			// Add user agent condition if not wildcard
+			if userAgent != "*" {
+				conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgent))
+			}
+
+			// Add path condition
+			pathCondition := buildPathCondition(disallow)
+			conditions = append(conditions, pathCondition)
+
+			rule.Expression = &config.ExpressionOrList{
+				All: conditions,
+			}
+
+			anubisRules = append(anubisRules, rule)
+		}
+
+	}
+
+	return anubisRules
+}
+
+func buildPathCondition(robotsPath string) string {
+	// Handle wildcards in robots.txt paths
+	if strings.Contains(robotsPath, "*") || strings.Contains(robotsPath, "?") {
+		// Convert robots.txt wildcards to regex
+		regex := regexp.QuoteMeta(robotsPath)
+		regex = strings.ReplaceAll(regex, `\*`, `.*`) // * becomes .*
+		regex = strings.ReplaceAll(regex, `\?`, `.`)  // ? becomes .
+		regex = "^" + regex
+		return fmt.Sprintf("path.matches(%q)", regex)
+	}
+
+	// Simple prefix match for most cases
+	return fmt.Sprintf("path.startsWith(%q)", robotsPath)
+}
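
As a quick illustration of the two branches in `buildPathCondition`, here is a sketch of an example function (not part of the diff) that could sit in this package's tests; the expected strings match the wildcards test data below:

```go
func Example_buildPathCondition() {
	fmt.Println(buildPathCondition("/admin/"))     // no wildcard: prefix match
	fmt.Println(buildPathCondition("/temp*.html")) // wildcard: anchored regex
	// Output:
	// path.startsWith("/admin/")
	// path.matches("^/temp.*\\.html")
}
```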

+ 418 - 0
cmd/robots2policy/robots2policy_test.go

@@ -0,0 +1,418 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strings"
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+type TestCase struct {
+	name         string
+	robotsFile   string
+	expectedFile string
+	options      TestOptions
+}
+
+type TestOptions struct {
+	format           string
+	action           string
+	crawlDelayWeight int
+	policyName       string
+	deniedAction     string
+}
+
+func TestDataFileConversion(t *testing.T) {
+
+	testCases := []TestCase{
+		{
+			name:         "simple_default",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "simple.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "simple_json",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "simple.json",
+			options:      TestOptions{format: "json"},
+		},
+		{
+			name:         "simple_deny_action",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "deny-action.yaml",
+			options:      TestOptions{format: "yaml", action: "DENY"},
+		},
+		{
+			name:         "simple_custom_name",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "custom-name.yaml",
+			options:      TestOptions{format: "yaml", policyName: "my-custom-policy"},
+		},
+		{
+			name:         "blacklist_with_crawl_delay",
+			robotsFile:   "blacklist.robots.txt",
+			expectedFile: "blacklist.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
+		{
+			name:         "wildcards",
+			robotsFile:   "wildcards.robots.txt",
+			expectedFile: "wildcards.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "empty_file",
+			robotsFile:   "empty.robots.txt",
+			expectedFile: "empty.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "complex_scenario",
+			robotsFile:   "complex.robots.txt",
+			expectedFile: "complex.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			robotsPath := filepath.Join("testdata", tc.robotsFile)
+			expectedPath := filepath.Join("testdata", tc.expectedFile)
+
+			// Read robots.txt input
+			robotsFile, err := os.Open(robotsPath)
+			if err != nil {
+				t.Fatalf("Failed to open robots file %s: %v", robotsPath, err)
+			}
+			defer robotsFile.Close()
+
+			// Parse robots.txt
+			rules, err := parseRobotsTxt(robotsFile)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			// Set test options
+			oldFormat := *outputFormat
+			oldAction := *baseAction
+			oldCrawlDelay := *crawlDelay
+			oldPolicyName := *policyName
+			oldDeniedAction := *userAgentDeny
+
+			if tc.options.format != "" {
+				*outputFormat = tc.options.format
+			}
+			if tc.options.action != "" {
+				*baseAction = tc.options.action
+			}
+			if tc.options.crawlDelayWeight > 0 {
+				*crawlDelay = tc.options.crawlDelayWeight
+			}
+			if tc.options.policyName != "" {
+				*policyName = tc.options.policyName
+			}
+			if tc.options.deniedAction != "" {
+				*userAgentDeny = tc.options.deniedAction
+			}
+
+			// Restore options after test
+			defer func() {
+				*outputFormat = oldFormat
+				*baseAction = oldAction
+				*crawlDelay = oldCrawlDelay
+				*policyName = oldPolicyName
+				*userAgentDeny = oldDeniedAction
+			}()
+
+			// Convert to Anubis rules
+			anubisRules := convertToAnubisRules(rules)
+
+			// Generate output
+			var actualOutput []byte
+			switch strings.ToLower(*outputFormat) {
+			case "yaml":
+				actualOutput, err = yaml.Marshal(anubisRules)
+			case "json":
+				actualOutput, err = json.MarshalIndent(anubisRules, "", "  ")
+			}
+			if err != nil {
+				t.Fatalf("Failed to marshal output: %v", err)
+			}
+
+			// Read expected output
+			expectedOutput, err := os.ReadFile(expectedPath)
+			if err != nil {
+				t.Fatalf("Failed to read expected file %s: %v", expectedPath, err)
+			}
+
+			if strings.ToLower(*outputFormat) == "yaml" {
+				var actualData []interface{}
+				var expectedData []interface{}
+
+				err = yaml.Unmarshal(actualOutput, &actualData)
+				if err != nil {
+					t.Fatalf("Failed to unmarshal actual output: %v", err)
+				}
+
+				err = yaml.Unmarshal(expectedOutput, &expectedData)
+				if err != nil {
+					t.Fatalf("Failed to unmarshal expected output: %v", err)
+				}
+
+				// Compare data structures
+				if !compareData(actualData, expectedData) {
+					actualStr := strings.TrimSpace(string(actualOutput))
+					expectedStr := strings.TrimSpace(string(expectedOutput))
+					t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
+				}
+			} else {
+				var actualData []interface{}
+				var expectedData []interface{}
+
+				err = json.Unmarshal(actualOutput, &actualData)
+				if err != nil {
+					t.Fatalf("Failed to unmarshal actual JSON output: %v", err)
+				}
+
+				err = json.Unmarshal(expectedOutput, &expectedData)
+				if err != nil {
+					t.Fatalf("Failed to unmarshal expected JSON output: %v", err)
+				}
+
+				// Compare data structures
+				if !compareData(actualData, expectedData) {
+					actualStr := strings.TrimSpace(string(actualOutput))
+					expectedStr := strings.TrimSpace(string(expectedOutput))
+					t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
+				}
+			}
+		})
+	}
+}
+
+func TestCaseInsensitiveParsing(t *testing.T) {
+	robotsTxt := `User-Agent: *
+Disallow: /admin
+Crawl-Delay: 10
+
+User-agent: TestBot
+disallow: /test
+crawl-delay: 5
+
+USER-AGENT: UpperBot
+DISALLOW: /upper
+CRAWL-DELAY: 20`
+
+	reader := strings.NewReader(robotsTxt)
+	rules, err := parseRobotsTxt(reader)
+	if err != nil {
+		t.Fatalf("Failed to parse case-insensitive robots.txt: %v", err)
+	}
+
+	expectedRules := 3
+	if len(rules) != expectedRules {
+		t.Errorf("Expected %d rules, got %d", expectedRules, len(rules))
+	}
+
+	// Check that all crawl delays were parsed
+	for i, rule := range rules {
+		expectedDelays := []int{10, 5, 20}
+		if rule.CrawlDelay != expectedDelays[i] {
+			t.Errorf("Rule %d: expected crawl delay %d, got %d", i, expectedDelays[i], rule.CrawlDelay)
+		}
+	}
+}
+
+func TestVariousOutputFormats(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin`
+
+	reader := strings.NewReader(robotsTxt)
+	rules, err := parseRobotsTxt(reader)
+	if err != nil {
+		t.Fatalf("Failed to parse robots.txt: %v", err)
+	}
+
+	oldPolicyName := *policyName
+	*policyName = "test-policy"
+	defer func() { *policyName = oldPolicyName }()
+
+	anubisRules := convertToAnubisRules(rules)
+
+	// Test YAML output
+	yamlOutput, err := yaml.Marshal(anubisRules)
+	if err != nil {
+		t.Fatalf("Failed to marshal YAML: %v", err)
+	}
+
+	if !strings.Contains(string(yamlOutput), "name: test-policy-disallow-1") {
+		t.Errorf("YAML output doesn't contain expected rule name")
+	}
+
+	// Test JSON output
+	jsonOutput, err := json.MarshalIndent(anubisRules, "", "  ")
+	if err != nil {
+		t.Fatalf("Failed to marshal JSON: %v", err)
+	}
+
+	if !strings.Contains(string(jsonOutput), `"name": "test-policy-disallow-1"`) {
+		t.Errorf("JSON output doesn't contain expected rule name")
+	}
+}
+
+func TestDifferentActions(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin`
+
+	testActions := []string{"ALLOW", "DENY", "CHALLENGE", "WEIGH"}
+
+	for _, action := range testActions {
+		t.Run("action_"+action, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldAction := *baseAction
+			*baseAction = action
+			defer func() { *baseAction = oldAction }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			if len(anubisRules) != 1 {
+				t.Fatalf("Expected 1 rule, got %d", len(anubisRules))
+			}
+
+			if anubisRules[0].Action != action {
+				t.Errorf("Expected action %s, got %s", action, anubisRules[0].Action)
+			}
+		})
+	}
+}
+
+func TestPolicyNaming(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin
+Disallow: /private
+
+User-agent: BadBot
+Disallow: /`
+
+	testNames := []string{"custom-policy", "my-rules", "site-protection"}
+
+	for _, name := range testNames {
+		t.Run("name_"+name, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldName := *policyName
+			*policyName = name
+			defer func() { *policyName = oldName }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// Check that all rule names use the custom prefix
+			for _, rule := range anubisRules {
+				if !strings.HasPrefix(rule.Name, name+"-") {
+					t.Errorf("Rule name %s doesn't start with expected prefix %s-", rule.Name, name)
+				}
+			}
+		})
+	}
+}
+
+func TestCrawlDelayWeights(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+User-agent: SlowBot
+Disallow: /slow
+Crawl-delay: 60`
+
+	testWeights := []int{1, 5, 10, 25}
+
+	for _, weight := range testWeights {
+		t.Run(fmt.Sprintf("weight_%d", weight), func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldWeight := *crawlDelay
+			*crawlDelay = weight
+			defer func() { *crawlDelay = oldWeight }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// Count weight rules and verify they have correct weight
+			weightRules := 0
+			for _, rule := range anubisRules {
+				if rule.Action == "WEIGH" && rule.Weight != nil {
+					weightRules++
+					if rule.Weight.Adjust != weight {
+						t.Errorf("Expected weight %d, got %d", weight, rule.Weight.Adjust)
+					}
+				}
+			}
+
+			expectedWeightRules := 2 // One for *, one for SlowBot
+			if weightRules != expectedWeightRules {
+				t.Errorf("Expected %d weight rules, got %d", expectedWeightRules, weightRules)
+			}
+		})
+	}
+}
+
+func TestBlacklistActions(t *testing.T) {
+	robotsTxt := `User-agent: BadBot
+Disallow: /
+
+User-agent: SpamBot
+Disallow: /`
+
+	testActions := []string{"DENY", "CHALLENGE"}
+
+	for _, action := range testActions {
+		t.Run("blacklist_"+action, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldAction := *userAgentDeny
+			*userAgentDeny = action
+			defer func() { *userAgentDeny = oldAction }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// All rules should be blacklist rules with the specified action
+			for _, rule := range anubisRules {
+				if !strings.Contains(rule.Name, "blacklist") {
+					t.Errorf("Expected blacklist rule, got %s", rule.Name)
+				}
+				if rule.Action != action {
+					t.Errorf("Expected action %s, got %s", action, rule.Action)
+				}
+			}
+		})
+	}
+}
+
+// compareData deep-compares the two unmarshaled data structures. Comparing
+// parsed values rather than raw bytes ignores formatting-only differences
+// between semantically equivalent YAML/JSON documents.
+func compareData(actual, expected interface{}) bool {
+	return reflect.DeepEqual(actual, expected)
+}

+ 15 - 0
cmd/robots2policy/testdata/blacklist.robots.txt

@@ -0,0 +1,15 @@
+# Test with blacklisted user agents
+User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+User-agent: BadBot
+Disallow: /
+
+User-agent: SpamBot
+Disallow: /
+Crawl-delay: 60
+
+User-agent: Googlebot
+Disallow: /search
+Crawl-delay: 5

+ 30 - 0
cmd/robots2policy/testdata/blacklist.yaml

@@ -0,0 +1,30 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression: path.startsWith("/admin")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-3
+- action: WEIGH
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-crawl-delay-4
+  weight:
+    adjust: 3
+- action: DENY
+  expression: userAgent.contains("SpamBot")
+  name: robots-txt-policy-blacklist-5
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-6
+  weight:
+    adjust: 3
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("Googlebot")
+        - path.startsWith("/search")
+  name: robots-txt-policy-disallow-7

+ 30 - 0
cmd/robots2policy/testdata/complex.robots.txt

@@ -0,0 +1,30 @@
+# Complex real-world example
+User-agent: *
+Disallow: /admin/
+Disallow: /private/
+Disallow: /api/internal/
+Allow: /api/public/
+Crawl-delay: 5
+
+User-agent: Googlebot
+Disallow: /search/
+Allow: /api/
+Crawl-delay: 2
+
+User-agent: Bingbot
+Disallow: /search/
+Disallow: /admin/
+Crawl-delay: 10
+
+User-agent: BadBot
+Disallow: /
+
+User-agent: SeoBot
+Disallow: /
+Crawl-delay: 300
+
+# Test with various patterns
+User-agent: TestBot
+Disallow: /*/admin
+Disallow: /temp*.html
+Disallow: /file?.log

+ 71 - 0
cmd/robots2policy/testdata/complex.yaml

@@ -0,0 +1,71 @@
+- action: WEIGH
+  expression: "true"
+  name: robots-txt-policy-crawl-delay-1
+  weight:
+    adjust: 5
+- action: CHALLENGE
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-2
+- action: CHALLENGE
+  expression: path.startsWith("/private/")
+  name: robots-txt-policy-disallow-3
+- action: CHALLENGE
+  expression: path.startsWith("/api/internal/")
+  name: robots-txt-policy-disallow-4
+- action: WEIGH
+  expression: userAgent.contains("Googlebot")
+  name: robots-txt-policy-crawl-delay-5
+  weight:
+    adjust: 5
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("Googlebot")
+        - path.startsWith("/search/")
+  name: robots-txt-policy-disallow-6
+- action: WEIGH
+  expression: userAgent.contains("Bingbot")
+  name: robots-txt-policy-crawl-delay-7
+  weight:
+    adjust: 5
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("Bingbot")
+        - path.startsWith("/search/")
+  name: robots-txt-policy-disallow-8
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("Bingbot")
+        - path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-9
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-10
+- action: WEIGH
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-crawl-delay-11
+  weight:
+    adjust: 5
+- action: DENY
+  expression: userAgent.contains("SeoBot")
+  name: robots-txt-policy-blacklist-12
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("TestBot")
+        - path.matches("^/.*/admin")
+  name: robots-txt-policy-disallow-13
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("TestBot")
+        - path.matches("^/temp.*\\.html")
+  name: robots-txt-policy-disallow-14
+- action: CHALLENGE
+  expression:
+    all:
+        - userAgent.contains("TestBot")
+        - path.matches("^/file.\\.log")
+  name: robots-txt-policy-disallow-15

+ 6 - 0
cmd/robots2policy/testdata/custom-name.yaml

@@ -0,0 +1,6 @@
+- action: CHALLENGE
+  expression: path.startsWith("/admin/")
+  name: my-custom-policy-disallow-1
+- action: CHALLENGE
+  expression: path.startsWith("/private")
+  name: my-custom-policy-disallow-2

+ 6 - 0
cmd/robots2policy/testdata/deny-action.yaml

@@ -0,0 +1,6 @@
+- action: DENY
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-1
+- action: DENY
+  expression: path.startsWith("/private")
+  name: robots-txt-policy-disallow-2

+ 2 - 0
cmd/robots2policy/testdata/empty.robots.txt

@@ -0,0 +1,2 @@
+# Empty robots.txt (comments only)
+# No actual rules

+ 1 - 0
cmd/robots2policy/testdata/empty.yaml

@@ -0,0 +1 @@
+[]

+ 12 - 0
cmd/robots2policy/testdata/simple.json

@@ -0,0 +1,12 @@
+[
+  {
+    "action": "CHALLENGE",
+    "expression": "path.startsWith(\"/admin/\")",
+    "name": "robots-txt-policy-disallow-1"
+  },
+  {
+    "action": "CHALLENGE",
+    "expression": "path.startsWith(\"/private\")",
+    "name": "robots-txt-policy-disallow-2"
+  }
+]

+ 5 - 0
cmd/robots2policy/testdata/simple.robots.txt

@@ -0,0 +1,5 @@
+# Simple robots.txt test
+User-agent: *
+Disallow: /admin/
+Disallow: /private
+Allow: /public

+ 6 - 0
cmd/robots2policy/testdata/simple.yaml

@@ -0,0 +1,6 @@
+- action: CHALLENGE
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.startsWith("/private")
+  name: robots-txt-policy-disallow-2

+ 6 - 0
cmd/robots2policy/testdata/wildcards.robots.txt

@@ -0,0 +1,6 @@
+# Test wildcard patterns
+User-agent: *
+Disallow: /search*
+Disallow: /*/private
+Disallow: /file?.txt
+Disallow: /admin/*?action=delete

+ 12 - 0
cmd/robots2policy/testdata/wildcards.yaml

@@ -0,0 +1,12 @@
+- action: CHALLENGE
+  expression: path.matches("^/search.*")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.matches("^/.*/private")
+  name: robots-txt-policy-disallow-2
+- action: CHALLENGE
+  expression: path.matches("^/file.\\.txt")
+  name: robots-txt-policy-disallow-3
+- action: CHALLENGE
+  expression: path.matches("^/admin/.*.action=delete")
+  name: robots-txt-policy-disallow-4

+ 1 - 0
docs/docs/CHANGELOG.md

@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Make progress bar styling more compatible (UXP, etc)
 - Optimized the OGTags subsystem with reduced allocations and runtime per request by up to 66%
 - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
+- Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409))
 
 ## v1.19.1: Jenomis cen Lexentale - Echo 1
 

+ 84 - 0
docs/docs/admin/robots2policy.mdx

@@ -0,0 +1,84 @@
+---
+title: robots2policy CLI Tool
+sidebar_position: 50
+---
+
+The `robots2policy` tool converts robots.txt files into Anubis challenge policies. It reads robots.txt rules and generates equivalent CEL expressions for path matching and user-agent filtering.
+
+## Installation
+
+Install directly with Go:
+
+```bash
+go install github.com/TecharoHQ/anubis/cmd/robots2policy@latest
+```
+## Usage
+
+Basic conversion from URL:
+
+```bash
+robots2policy -input https://www.example.com/robots.txt
+```
+
+Convert local file to YAML:
+
+```bash
+robots2policy -input robots.txt -output policy.yaml
+```
+
+Convert with custom settings:
+
+```bash
+robots2policy -input robots.txt -action DENY -format json
+```
+
+## Options
+
+| Flag                  | Description                                                        | Default             |
+|-----------------------|--------------------------------------------------------------------|---------------------|
+| `-input`              | robots.txt file path or URL (use `-` for stdin)                    | *required*          |
+| `-output`             | Output file (use `-` for stdout)                                   | stdout              |
+| `-format`             | Output format: `yaml` or `json`                                    | `yaml`              |
+| `-action`             | Action for disallowed paths: `ALLOW`, `DENY`, `CHALLENGE`, `WEIGH` | `CHALLENGE`         |
+| `-name`               | Policy name prefix                                                 | `robots-txt-policy` |
+| `-crawl-delay-weight` | Weight adjustment for crawl-delay rules (`0` disables)             | `0` (disabled)      |
+| `-deny-user-agents`   | Action for blacklisted user agents                                 | `DENY`              |
+
+## Example
+
+Input robots.txt:
+```txt
+User-agent: *
+Disallow: /admin/
+Disallow: /private
+
+User-agent: BadBot
+Disallow: /
+```
+
+Generated policy:
+```yaml
+- action: CHALLENGE
+  expression: path.startsWith("/admin/")
+  name: robots-txt-policy-disallow-1
+- action: CHALLENGE
+  expression: path.startsWith("/private")
+  name: robots-txt-policy-disallow-2
+- action: DENY
+  expression: userAgent.contains("BadBot")
+  name: robots-txt-policy-blacklist-3
+```
+
+## Using the Generated Policy
+
+Save the output and import it in your main policy file:
+
+```yaml
+import:
+  - path: "./robots-policy.yaml"
+```
+
+The tool handles wildcard patterns, user-agent-specific rules, and blacklisted bots automatically.
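
For example, a wildcard rule like `Disallow: /file?.txt` becomes an anchored regex match. This snippet is taken from the wildcards test data in this PR:

```yaml
- action: CHALLENGE
  expression: path.matches("^/file.\\.txt")
  name: robots-txt-policy-disallow-3
```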

+ 2 - 2
go.mod

@@ -12,7 +12,9 @@ require (
 	github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a
 	github.com/yl2chen/cidranger v1.0.2
 	golang.org/x/net v0.41.0
+	gopkg.in/yaml.v3 v3.0.1
 	k8s.io/apimachinery v0.33.1
+	sigs.k8s.io/yaml v1.4.0
 )
 
 require (
@@ -104,11 +106,9 @@ require (
 	google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect
 	google.golang.org/protobuf v1.36.5 // indirect
 	gopkg.in/warnings.v0 v0.1.2 // indirect
-	gopkg.in/yaml.v3 v3.0.1 // indirect
 	honnef.co/go/tools v0.6.1 // indirect
 	mvdan.cc/sh/v3 v3.11.0 // indirect
 	sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
-	sigs.k8s.io/yaml v1.4.0 // indirect
 )
 
 tool (

+ 2 - 1
internal/ogtags/mem_test.go

@@ -1,11 +1,12 @@
 package ogtags
 
 import (
-	"golang.org/x/net/html"
 	"net/url"
 	"runtime"
 	"strings"
 	"testing"
+
+	"golang.org/x/net/html"
 )
 
 func BenchmarkGetTarget(b *testing.B) {

+ 6 - 6
internal/ogtags/ogtags.go

@@ -21,17 +21,17 @@ const (
 )
 
 type OGTagCache struct {
-	cache               *decaymap.Impl[string, map[string]string]
-	targetURL           *url.URL
-	client              *http.Client
+	cache     *decaymap.Impl[string, map[string]string]
+	targetURL *url.URL
+	client    *http.Client
+
+	// Pre-built strings for optimization
+	unixPrefix          string // "http://unix"
 	approvedTags        []string
 	approvedPrefixes    []string
 	ogTimeToLive        time.Duration
 	ogCacheConsiderHost bool
 	ogPassthrough       bool
-
-	// Pre-built strings for optimization
-	unixPrefix string // "http://unix"
 }
 
 func NewOGTagCache(target string, ogPassthrough bool, ogTimeToLive time.Duration, ogTagsConsiderHost bool) *OGTagCache {

+ 2 - 1
internal/ogtags/ogtags_fuzz_test.go

@@ -1,11 +1,12 @@
 package ogtags
 
 import (
-	"golang.org/x/net/html"
 	"net/url"
 	"strings"
 	"testing"
 	"unicode/utf8"
+
+	"golang.org/x/net/html"
 )
 
 // FuzzGetTarget tests getTarget with various inputs

+ 12 - 12
lib/policy/config/config.go

@@ -46,15 +46,15 @@ const (
 const DefaultAlgorithm = "fast"
 
 type BotConfig struct {
-	UserAgentRegex *string           `json:"user_agent_regex,omitempty"`
-	PathRegex      *string           `json:"path_regex,omitempty"`
-	HeadersRegex   map[string]string `json:"headers_regex,omitempty"`
-	Expression     *ExpressionOrList `json:"expression,omitempty"`
-	Challenge      *ChallengeRules   `json:"challenge,omitempty"`
-	Weight         *Weight           `json:"weight,omitempty"`
-	Name           string            `json:"name"`
-	Action         Rule              `json:"action"`
-	RemoteAddr     []string          `json:"remote_addresses,omitempty"`
+	UserAgentRegex *string           `json:"user_agent_regex,omitempty" yaml:"user_agent_regex,omitempty"`
+	PathRegex      *string           `json:"path_regex,omitempty" yaml:"path_regex,omitempty"`
+	HeadersRegex   map[string]string `json:"headers_regex,omitempty" yaml:"headers_regex,omitempty"`
+	Expression     *ExpressionOrList `json:"expression,omitempty" yaml:"expression,omitempty"`
+	Challenge      *ChallengeRules   `json:"challenge,omitempty" yaml:"challenge,omitempty"`
+	Weight         *Weight           `json:"weight,omitempty" yaml:"weight,omitempty"`
+	Name           string            `json:"name" yaml:"name"`
+	Action         Rule              `json:"action" yaml:"action"`
+	RemoteAddr     []string          `json:"remote_addresses,omitempty" yaml:"remote_addresses,omitempty"`
 }
 
 func (b BotConfig) Zero() bool {
@@ -170,9 +170,9 @@ func (b *BotConfig) Valid() error {
 }
 
 type ChallengeRules struct {
-	Algorithm  string `json:"algorithm"`
-	Difficulty int    `json:"difficulty"`
-	ReportAs   int    `json:"report_as"`
+	Algorithm  string `json:"algorithm,omitempty" yaml:"algorithm,omitempty"`
+	Difficulty int    `json:"difficulty,omitempty" yaml:"difficulty,omitempty"`
+	ReportAs   int    `json:"report_as,omitempty" yaml:"report_as,omitempty"`
 }
 
 var (

+ 40 - 3
lib/policy/config/expressionorlist.go

@@ -13,9 +13,9 @@ var (
 )
 
 type ExpressionOrList struct {
-	Expression string   `json:"-"`
-	All        []string `json:"all,omitempty"`
-	Any        []string `json:"any,omitempty"`
+	Expression string   `json:"-" yaml:"-"`
+	All        []string `json:"all,omitempty" yaml:"all,omitempty"`
+	Any        []string `json:"any,omitempty" yaml:"any,omitempty"`
 }
 
 func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
@@ -34,6 +34,43 @@ func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
 	return true
 }
 
+func (eol *ExpressionOrList) MarshalYAML() (any, error) {
+	switch {
+	case len(eol.All) == 1 && len(eol.Any) == 0:
+		eol.Expression = eol.All[0]
+		eol.All = nil
+	case len(eol.Any) == 1 && len(eol.All) == 0:
+		eol.Expression = eol.Any[0]
+		eol.Any = nil
+	}
+
+	if eol.Expression != "" {
+		return eol.Expression, nil
+	}
+
+	type RawExpressionOrList ExpressionOrList
+	return RawExpressionOrList(*eol), nil
+}
+
+func (eol *ExpressionOrList) MarshalJSON() ([]byte, error) {
+	switch {
+	case len(eol.All) == 1 && len(eol.Any) == 0:
+		eol.Expression = eol.All[0]
+		eol.All = nil
+	case len(eol.Any) == 1 && len(eol.All) == 0:
+		eol.Expression = eol.Any[0]
+		eol.Any = nil
+	}
+
+	if eol.Expression != "" {
+		return json.Marshal(string(eol.Expression))
+	}
+
+	type RawExpressionOrList ExpressionOrList
+	val := RawExpressionOrList(*eol)
+	return json.Marshal(val)
+}
+
 func (eol *ExpressionOrList) UnmarshalJSON(data []byte) error {
 	switch string(data[0]) {
 	case `"`: // string
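
The marshalers above collapse a one-element `all`/`any` list into a bare expression string. A minimal sketch of the observable behavior (expected output taken from this PR's test vectors):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/TecharoHQ/anubis/lib/policy/config"
)

func main() {
	// A one-element list collapses to a bare string.
	b, _ := json.Marshal(&config.ExpressionOrList{All: []string{"true"}})
	fmt.Println(string(b)) // "true"

	// Multi-element lists keep the object form.
	b, _ = json.Marshal(&config.ExpressionOrList{Any: []string{"true", "false"}})
	fmt.Println(string(b)) // {"any":["true","false"]}
}
```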

+ 136 - 1
lib/policy/config/expressionorlist_test.go

@@ -1,12 +1,147 @@
 package config
 
 import (
+	"bytes"
 	"encoding/json"
 	"errors"
 	"testing"
+
+	yaml "sigs.k8s.io/yaml/goyaml.v3"
 )
 
-func TestExpressionOrListUnmarshal(t *testing.T) {
+func TestExpressionOrListMarshalJSON(t *testing.T) {
+	for _, tt := range []struct {
+		name   string
+		input  *ExpressionOrList
+		output []byte
+		err    error
+	}{
+		{
+			name: "single expression",
+			input: &ExpressionOrList{
+				Expression: "true",
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "all",
+			input: &ExpressionOrList{
+				All: []string{"true", "true"},
+			},
+			output: []byte(`{"all":["true","true"]}`),
+			err:    nil,
+		},
+		{
+			name: "all one",
+			input: &ExpressionOrList{
+				All: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "any",
+			input: &ExpressionOrList{
+				Any: []string{"true", "false"},
+			},
+			output: []byte(`{"any":["true","false"]}`),
+			err:    nil,
+		},
+		{
+			name: "any one",
+			input: &ExpressionOrList{
+				Any: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := json.Marshal(tt.input)
+			if !errors.Is(err, tt.err) {
+				t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
+			}
+
+			if !bytes.Equal(result, tt.output) {
+				t.Logf("wanted: %s", string(tt.output))
+				t.Logf("got:    %s", string(result))
+				t.Error("mismatched output")
+			}
+		})
+	}
+}
+
+func TestExpressionOrListMarshalYAML(t *testing.T) {
+	for _, tt := range []struct {
+		name   string
+		input  *ExpressionOrList
+		output []byte
+		err    error
+	}{
+		{
+			name: "single expression",
+			input: &ExpressionOrList{
+				Expression: "true",
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "all",
+			input: &ExpressionOrList{
+				All: []string{"true", "true"},
+			},
+			output: []byte(`all:
+    - "true"
+    - "true"`),
+			err: nil,
+		},
+		{
+			name: "all one",
+			input: &ExpressionOrList{
+				All: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "any",
+			input: &ExpressionOrList{
+				Any: []string{"true", "false"},
+			},
+			output: []byte(`any:
+    - "true"
+    - "false"`),
+			err: nil,
+		},
+		{
+			name: "any one",
+			input: &ExpressionOrList{
+				Any: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := yaml.Marshal(tt.input)
+			if !errors.Is(err, tt.err) {
+				t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
+			}
+
+			result = bytes.TrimSpace(result)
+
+			if !bytes.Equal(result, tt.output) {
+				t.Logf("wanted: %q", string(tt.output))
+				t.Logf("got:    %q", string(result))
+				t.Error("mismatched output")
+			}
+		})
+	}
+}
+
+func TestExpressionOrListUnmarshalJSON(t *testing.T) {
 	for _, tt := range []struct {
 		err      error
 		validErr error

+ 1 - 1
lib/policy/config/weight.go

@@ -1,5 +1,5 @@
 package config
 
 type Weight struct {
-	Adjust int `json:"adjust"`
+	Adjust int `json:"adjust" yaml:"adjust"`
 }

+ 1 - 0
yeetfile.js

@@ -22,6 +22,7 @@ $`npm run assets`;
 
         build: ({ bin, etc, systemd, doc }) => {
             $`go build -o ${bin}/anubis -ldflags '-s -w -extldflags "-static" -X "github.com/TecharoHQ/anubis.Version=${git.tag()}"' ./cmd/anubis`;
+            $`go build -o ${bin}/anubis-robots2policy -ldflags '-s -w -extldflags "-static" -X "github.com/TecharoHQ/anubis.Version=${git.tag()}"' ./cmd/robots2policy`;
 
             file.install("./run/anubis@.service", `${systemd}/anubis@.service`);
             file.install("./run/default.env", `${etc}/default.env`);