|
@@ -0,0 +1,418 @@
|
|
|
|
+package main
|
|
|
|
+
|
|
|
|
+import (
|
|
|
|
+ "encoding/json"
|
|
|
|
+ "fmt"
|
|
|
|
+ "os"
|
|
|
|
+ "path/filepath"
|
|
|
|
+ "reflect"
|
|
|
|
+ "strings"
|
|
|
|
+ "testing"
|
|
|
|
+
|
|
|
|
+ "gopkg.in/yaml.v3"
|
|
|
|
+)
|
|
|
|
+
|
|
|
|
+type TestCase struct {
|
|
|
|
+ name string
|
|
|
|
+ robotsFile string
|
|
|
|
+ expectedFile string
|
|
|
|
+ options TestOptions
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// TestOptions lists the converter settings a test case may override.
// A zero-valued field means "leave the corresponding setting untouched".
type TestOptions struct {
	// format selects the output encoding ("yaml" or "json"); action
	// overrides the base action assigned to generated rules.
	format, action string

	// crawlDelayWeight overrides the crawl-delay weight; it is only applied
	// when greater than zero.
	crawlDelayWeight int

	// policyName overrides the policy name setting; deniedAction overrides
	// the user-agent deny action setting.
	policyName, deniedAction string
}
|
|
|
|
+
|
|
|
|
+func TestDataFileConversion(t *testing.T) {
|
|
|
|
+
|
|
|
|
+ testCases := []TestCase{
|
|
|
|
+ {
|
|
|
|
+ name: "simple_default",
|
|
|
|
+ robotsFile: "simple.robots.txt",
|
|
|
|
+ expectedFile: "simple.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "simple_json",
|
|
|
|
+ robotsFile: "simple.robots.txt",
|
|
|
|
+ expectedFile: "simple.json",
|
|
|
|
+ options: TestOptions{format: "json"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "simple_deny_action",
|
|
|
|
+ robotsFile: "simple.robots.txt",
|
|
|
|
+ expectedFile: "deny-action.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml", action: "DENY"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "simple_custom_name",
|
|
|
|
+ robotsFile: "simple.robots.txt",
|
|
|
|
+ expectedFile: "custom-name.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml", policyName: "my-custom-policy"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "blacklist_with_crawl_delay",
|
|
|
|
+ robotsFile: "blacklist.robots.txt",
|
|
|
|
+ expectedFile: "blacklist.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml", crawlDelayWeight: 3},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "wildcards",
|
|
|
|
+ robotsFile: "wildcards.robots.txt",
|
|
|
|
+ expectedFile: "wildcards.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "empty_file",
|
|
|
|
+ robotsFile: "empty.robots.txt",
|
|
|
|
+ expectedFile: "empty.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml"},
|
|
|
|
+ },
|
|
|
|
+ {
|
|
|
|
+ name: "complex_scenario",
|
|
|
|
+ robotsFile: "complex.robots.txt",
|
|
|
|
+ expectedFile: "complex.yaml",
|
|
|
|
+ options: TestOptions{format: "yaml", crawlDelayWeight: 5},
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ for _, tc := range testCases {
|
|
|
|
+ t.Run(tc.name, func(t *testing.T) {
|
|
|
|
+ robotsPath := filepath.Join("testdata", tc.robotsFile)
|
|
|
|
+ expectedPath := filepath.Join("testdata", tc.expectedFile)
|
|
|
|
+
|
|
|
|
+ // Read robots.txt input
|
|
|
|
+ robotsFile, err := os.Open(robotsPath)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to open robots file %s: %v", robotsPath, err)
|
|
|
|
+ }
|
|
|
|
+ defer robotsFile.Close()
|
|
|
|
+
|
|
|
|
+ // Parse robots.txt
|
|
|
|
+ rules, err := parseRobotsTxt(robotsFile)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Set test options
|
|
|
|
+ oldFormat := *outputFormat
|
|
|
|
+ oldAction := *baseAction
|
|
|
|
+ oldCrawlDelay := *crawlDelay
|
|
|
|
+ oldPolicyName := *policyName
|
|
|
|
+ oldDeniedAction := *userAgentDeny
|
|
|
|
+
|
|
|
|
+ if tc.options.format != "" {
|
|
|
|
+ *outputFormat = tc.options.format
|
|
|
|
+ }
|
|
|
|
+ if tc.options.action != "" {
|
|
|
|
+ *baseAction = tc.options.action
|
|
|
|
+ }
|
|
|
|
+ if tc.options.crawlDelayWeight > 0 {
|
|
|
|
+ *crawlDelay = tc.options.crawlDelayWeight
|
|
|
|
+ }
|
|
|
|
+ if tc.options.policyName != "" {
|
|
|
|
+ *policyName = tc.options.policyName
|
|
|
|
+ }
|
|
|
|
+ if tc.options.deniedAction != "" {
|
|
|
|
+ *userAgentDeny = tc.options.deniedAction
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Restore options after test
|
|
|
|
+ defer func() {
|
|
|
|
+ *outputFormat = oldFormat
|
|
|
|
+ *baseAction = oldAction
|
|
|
|
+ *crawlDelay = oldCrawlDelay
|
|
|
|
+ *policyName = oldPolicyName
|
|
|
|
+ *userAgentDeny = oldDeniedAction
|
|
|
|
+ }()
|
|
|
|
+
|
|
|
|
+ // Convert to Anubis rules
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ // Generate output
|
|
|
|
+ var actualOutput []byte
|
|
|
|
+ switch strings.ToLower(*outputFormat) {
|
|
|
|
+ case "yaml":
|
|
|
|
+ actualOutput, err = yaml.Marshal(anubisRules)
|
|
|
|
+ case "json":
|
|
|
|
+ actualOutput, err = json.MarshalIndent(anubisRules, "", " ")
|
|
|
|
+ }
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to marshal output: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Read expected output
|
|
|
|
+ expectedOutput, err := os.ReadFile(expectedPath)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to read expected file %s: %v", expectedPath, err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if strings.ToLower(*outputFormat) == "yaml" {
|
|
|
|
+ var actualData []interface{}
|
|
|
|
+ var expectedData []interface{}
|
|
|
|
+
|
|
|
|
+ err = yaml.Unmarshal(actualOutput, &actualData)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to unmarshal actual output: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ err = yaml.Unmarshal(expectedOutput, &expectedData)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to unmarshal expected output: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Compare data structures
|
|
|
|
+ if !compareData(actualData, expectedData) {
|
|
|
|
+ actualStr := strings.TrimSpace(string(actualOutput))
|
|
|
|
+ expectedStr := strings.TrimSpace(string(expectedOutput))
|
|
|
|
+ t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ var actualData []interface{}
|
|
|
|
+ var expectedData []interface{}
|
|
|
|
+
|
|
|
|
+ err = json.Unmarshal(actualOutput, &actualData)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to unmarshal actual JSON output: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ err = json.Unmarshal(expectedOutput, &expectedData)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to unmarshal expected JSON output: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Compare data structures
|
|
|
|
+ if !compareData(actualData, expectedData) {
|
|
|
|
+ actualStr := strings.TrimSpace(string(actualOutput))
|
|
|
|
+ expectedStr := strings.TrimSpace(string(expectedOutput))
|
|
|
|
+ t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestCaseInsensitiveParsing(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-Agent: *
|
|
|
|
+Disallow: /admin
|
|
|
|
+Crawl-Delay: 10
|
|
|
|
+
|
|
|
|
+User-agent: TestBot
|
|
|
|
+disallow: /test
|
|
|
|
+crawl-delay: 5
|
|
|
|
+
|
|
|
|
+USER-AGENT: UpperBot
|
|
|
|
+DISALLOW: /upper
|
|
|
|
+CRAWL-DELAY: 20`
|
|
|
|
+
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse case-insensitive robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ expectedRules := 3
|
|
|
|
+ if len(rules) != expectedRules {
|
|
|
|
+ t.Errorf("Expected %d rules, got %d", expectedRules, len(rules))
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Check that all crawl delays were parsed
|
|
|
|
+ for i, rule := range rules {
|
|
|
|
+ expectedDelays := []int{10, 5, 20}
|
|
|
|
+ if rule.CrawlDelay != expectedDelays[i] {
|
|
|
|
+ t.Errorf("Rule %d: expected crawl delay %d, got %d", i, expectedDelays[i], rule.CrawlDelay)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestVariousOutputFormats(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-agent: *
|
|
|
|
+Disallow: /admin`
|
|
|
|
+
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ oldPolicyName := *policyName
|
|
|
|
+ *policyName = "test-policy"
|
|
|
|
+ defer func() { *policyName = oldPolicyName }()
|
|
|
|
+
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ // Test YAML output
|
|
|
|
+ yamlOutput, err := yaml.Marshal(anubisRules)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to marshal YAML: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if !strings.Contains(string(yamlOutput), "name: test-policy-disallow-1") {
|
|
|
|
+ t.Errorf("YAML output doesn't contain expected rule name")
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Test JSON output
|
|
|
|
+ jsonOutput, err := json.MarshalIndent(anubisRules, "", " ")
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to marshal JSON: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if !strings.Contains(string(jsonOutput), `"name": "test-policy-disallow-1"`) {
|
|
|
|
+ t.Errorf("JSON output doesn't contain expected rule name")
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestDifferentActions(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-agent: *
|
|
|
|
+Disallow: /admin`
|
|
|
|
+
|
|
|
|
+ testActions := []string{"ALLOW", "DENY", "CHALLENGE", "WEIGH"}
|
|
|
|
+
|
|
|
|
+ for _, action := range testActions {
|
|
|
|
+ t.Run("action_"+action, func(t *testing.T) {
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ oldAction := *baseAction
|
|
|
|
+ *baseAction = action
|
|
|
|
+ defer func() { *baseAction = oldAction }()
|
|
|
|
+
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ if len(anubisRules) != 1 {
|
|
|
|
+ t.Fatalf("Expected 1 rule, got %d", len(anubisRules))
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if anubisRules[0].Action != action {
|
|
|
|
+ t.Errorf("Expected action %s, got %s", action, anubisRules[0].Action)
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestPolicyNaming(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-agent: *
|
|
|
|
+Disallow: /admin
|
|
|
|
+Disallow: /private
|
|
|
|
+
|
|
|
|
+User-agent: BadBot
|
|
|
|
+Disallow: /`
|
|
|
|
+
|
|
|
|
+ testNames := []string{"custom-policy", "my-rules", "site-protection"}
|
|
|
|
+
|
|
|
|
+ for _, name := range testNames {
|
|
|
|
+ t.Run("name_"+name, func(t *testing.T) {
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ oldName := *policyName
|
|
|
|
+ *policyName = name
|
|
|
|
+ defer func() { *policyName = oldName }()
|
|
|
|
+
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ // Check that all rule names use the custom prefix
|
|
|
|
+ for _, rule := range anubisRules {
|
|
|
|
+ if !strings.HasPrefix(rule.Name, name+"-") {
|
|
|
|
+ t.Errorf("Rule name %s doesn't start with expected prefix %s-", rule.Name, name)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestCrawlDelayWeights(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-agent: *
|
|
|
|
+Disallow: /admin
|
|
|
|
+Crawl-delay: 10
|
|
|
|
+
|
|
|
|
+User-agent: SlowBot
|
|
|
|
+Disallow: /slow
|
|
|
|
+Crawl-delay: 60`
|
|
|
|
+
|
|
|
|
+ testWeights := []int{1, 5, 10, 25}
|
|
|
|
+
|
|
|
|
+ for _, weight := range testWeights {
|
|
|
|
+ t.Run(fmt.Sprintf("weight_%d", weight), func(t *testing.T) {
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ oldWeight := *crawlDelay
|
|
|
|
+ *crawlDelay = weight
|
|
|
|
+ defer func() { *crawlDelay = oldWeight }()
|
|
|
|
+
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ // Count weight rules and verify they have correct weight
|
|
|
|
+ weightRules := 0
|
|
|
|
+ for _, rule := range anubisRules {
|
|
|
|
+ if rule.Action == "WEIGH" && rule.Weight != nil {
|
|
|
|
+ weightRules++
|
|
|
|
+ if rule.Weight.Adjust != weight {
|
|
|
|
+ t.Errorf("Expected weight %d, got %d", weight, rule.Weight.Adjust)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ expectedWeightRules := 2 // One for *, one for SlowBot
|
|
|
|
+ if weightRules != expectedWeightRules {
|
|
|
|
+ t.Errorf("Expected %d weight rules, got %d", expectedWeightRules, weightRules)
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func TestBlacklistActions(t *testing.T) {
|
|
|
|
+ robotsTxt := `User-agent: BadBot
|
|
|
|
+Disallow: /
|
|
|
|
+
|
|
|
|
+User-agent: SpamBot
|
|
|
|
+Disallow: /`
|
|
|
|
+
|
|
|
|
+ testActions := []string{"DENY", "CHALLENGE"}
|
|
|
|
+
|
|
|
|
+ for _, action := range testActions {
|
|
|
|
+ t.Run("blacklist_"+action, func(t *testing.T) {
|
|
|
|
+ reader := strings.NewReader(robotsTxt)
|
|
|
|
+ rules, err := parseRobotsTxt(reader)
|
|
|
|
+ if err != nil {
|
|
|
|
+ t.Fatalf("Failed to parse robots.txt: %v", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ oldAction := *userAgentDeny
|
|
|
|
+ *userAgentDeny = action
|
|
|
|
+ defer func() { *userAgentDeny = oldAction }()
|
|
|
|
+
|
|
|
|
+ anubisRules := convertToAnubisRules(rules)
|
|
|
|
+
|
|
|
|
+ // All rules should be blacklist rules with the specified action
|
|
|
|
+ for _, rule := range anubisRules {
|
|
|
|
+ if !strings.Contains(rule.Name, "blacklist") {
|
|
|
|
+ t.Errorf("Expected blacklist rule, got %s", rule.Name)
|
|
|
|
+ }
|
|
|
|
+ if rule.Action != action {
|
|
|
|
+ t.Errorf("Expected action %s, got %s", action, rule.Action)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// compareData reports whether two decoded YAML/JSON documents are equal,
// ignoring representation differences that do not change their meaning:
// every numeric value is compared as a float64 (so 5 and 5.0 match) and maps
// with generic keys (map[interface{}]interface{}) are compared as
// string-keyed maps. Slice order is significant, and a nil slice or map is
// still distinct from an empty one.
func compareData(actual, expected interface{}) bool {
	return reflect.DeepEqual(normalizeData(actual), normalizeData(expected))
}

// normalizeData returns a copy of a decoded document in which numbers and
// map key types are brought to one canonical form; the input is not modified.
func normalizeData(v interface{}) interface{} {
	switch val := v.(type) {
	case []interface{}:
		if val == nil {
			return val
		}
		out := make([]interface{}, len(val))
		for i, item := range val {
			out[i] = normalizeData(item)
		}
		return out
	case map[string]interface{}:
		if val == nil {
			return val
		}
		out := make(map[string]interface{}, len(val))
		for k, item := range val {
			out[k] = normalizeData(item)
		}
		return out
	case map[interface{}]interface{}:
		if val == nil {
			return map[string]interface{}(nil)
		}
		out := make(map[string]interface{}, len(val))
		for k, item := range val {
			// Keys are rendered with fmt.Sprint so that this map compares
			// equal to its string-keyed counterpart.
			out[fmt.Sprint(k)] = normalizeData(item)
		}
		return out
	}

	// Scalars: collapse every integer and float kind to float64. A nil
	// interface yields reflect.Invalid and falls through unchanged.
	rv := reflect.ValueOf(v)
	switch rv.Kind() {
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return float64(rv.Int())
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return float64(rv.Uint())
	case reflect.Float32, reflect.Float64:
		return rv.Float()
	}
	return v
}
|