batch_process.go 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. /*
  2. Batch process robots.txt files from archives like https://github.com/nrjones8/robots-dot-txt-archive-bot/tree/master/data/cleaned
  3. into Anubis CEL policies. Usage: go run batch_process.go <directory with robots.txt files>
  4. */
  5. package main
  6. import (
  7. "fmt"
  8. "io/fs"
  9. "log"
  10. "os"
  11. "os/exec"
  12. "path/filepath"
  13. "strings"
  14. )
  15. func main() {
  16. if len(os.Args) < 2 {
  17. fmt.Println("Usage: go run batch_process.go <cleaned_directory>")
  18. fmt.Println("Example: go run batch_process.go ./cleaned")
  19. os.Exit(1)
  20. }
  21. cleanedDir := os.Args[1]
  22. outputDir := "generated_policies"
  23. // Create output directory
  24. if err := os.MkdirAll(outputDir, 0755); err != nil {
  25. log.Fatalf("Failed to create output directory: %v", err)
  26. }
  27. count := 0
  28. err := filepath.WalkDir(cleanedDir, func(path string, d fs.DirEntry, err error) error {
  29. if err != nil {
  30. return err
  31. }
  32. // Skip directories
  33. if d.IsDir() {
  34. return nil
  35. }
  36. // Generate policy name from file path
  37. relPath, _ := filepath.Rel(cleanedDir, path)
  38. policyName := strings.ReplaceAll(relPath, "/", "-")
  39. policyName = strings.TrimSuffix(policyName, "-robots.txt")
  40. policyName = strings.ReplaceAll(policyName, ".", "-")
  41. outputFile := filepath.Join(outputDir, policyName+".yaml")
  42. cmd := exec.Command("go", "run", "main.go",
  43. "-input", path,
  44. "-output", outputFile,
  45. "-name", policyName,
  46. "-format", "yaml")
  47. if err := cmd.Run(); err != nil {
  48. fmt.Printf("Warning: Failed to process %s: %v\n", path, err)
  49. return nil // Continue processing other files
  50. }
  51. count++
  52. if count%100 == 0 {
  53. fmt.Printf("Processed %d files...\n", count)
  54. } else if count%10 == 0 {
  55. fmt.Print(".")
  56. }
  57. return nil
  58. })
  59. if err != nil {
  60. log.Fatalf("Error walking directory: %v", err)
  61. }
  62. fmt.Printf("Successfully processed %d robots.txt files\n", count)
  63. fmt.Printf("Generated policies saved to: %s/\n", outputDir)
  64. }