Pārlūkot izejas kodu

feat(expressions): add randInt function to allow making rules nondeterministic (#578)

This seems counter-intuitive at first glance, but let me cook.

One of the problems with Anubis is that the rule matching is super
deterministic. This means that attackers can figure out what patterns
they are hitting and change things to bypass them.

The randInt function lets you have rulesets behave nondeterministically.
This is a very easy way to hang yourself, but can be great to
psychologically mess with scraper operators. Consider this rule:

```yaml
- name: deny-lightpanda-sometimes
  action: DENY
  expression:
    all:
      - userAgent.matches("LightPanda")
      - randInt(16) >= 4
```

It would match about 75% of the time.

Signed-off-by: Xe Iaso <me@xeiaso.net>
Xe Iaso 3 nedēļas atpakaļ
vecāks
revīzija
22c47f40d1

+ 39 - 39
data/botPolicies.yaml

@@ -11,44 +11,44 @@
 ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
 ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
 
 
 bots:
 bots:
-# Pathological bots to deny
-- # This correlates to data/bots/deny-pathological.yaml in the source tree
-  # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
-  import: (data)/bots/_deny-pathological.yaml
-- import: (data)/bots/aggressive-brazilian-scrapers.yaml
-
-# Enforce https://github.com/ai-robots-txt/ai.robots.txt
-- import: (data)/bots/ai-robots-txt.yaml
-
-# Search engine crawlers to allow, defaults to:
-#   - Google (so they don't try to bypass Anubis)
-#   - Bing
-#   - DuckDuckGo
-#   - Qwant
-#   - The Internet Archive
-#   - Kagi
-#   - Marginalia
-#   - Mojeek
-- import: (data)/crawlers/_allow-good.yaml
-
-# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
-- import: (data)/common/keep-internet-working.yaml
-
-# # Punish any bot with "bot" in the user-agent string
-# # This is known to have a high false-positive rate, use at your own risk
-# - name: generic-bot-catchall
-#   user_agent_regex: (?i:bot|crawler)
-#   action: CHALLENGE
-#   challenge:
-#     difficulty: 16  # impossible
-#     report_as: 4    # lie to the operator
-#     algorithm: slow # intentionally waste CPU cycles and time
-
-# Generic catchall rule
-- name: generic-browser
-  user_agent_regex: >-
-    Mozilla|Opera
-  action: CHALLENGE
+  # Pathological bots to deny
+  - # This correlates to data/bots/deny-pathological.yaml in the source tree
+    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
+    import: (data)/bots/_deny-pathological.yaml
+  - import: (data)/bots/aggressive-brazilian-scrapers.yaml
+
+  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
+  - import: (data)/bots/ai-robots-txt.yaml
+
+  # Search engine crawlers to allow, defaults to:
+  #   - Google (so they don't try to bypass Anubis)
+  #   - Bing
+  #   - DuckDuckGo
+  #   - Qwant
+  #   - The Internet Archive
+  #   - Kagi
+  #   - Marginalia
+  #   - Mojeek
+  - import: (data)/crawlers/_allow-good.yaml
+
+  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
+  - import: (data)/common/keep-internet-working.yaml
+
+  # # Punish any bot with "bot" in the user-agent string
+  # # This is known to have a high false-positive rate, use at your own risk
+  # - name: generic-bot-catchall
+  #   user_agent_regex: (?i:bot|crawler)
+  #   action: CHALLENGE
+  #   challenge:
+  #     difficulty: 16  # impossible
+  #     report_as: 4    # lie to the operator
+  #     algorithm: slow # intentionally waste CPU cycles and time
+
+  # Generic catchall rule
+  - name: generic-browser
+    user_agent_regex: >-
+      Mozilla|Opera
+    action: CHALLENGE
 
 
 dnsbl: false
 dnsbl: false
 
 
@@ -58,4 +58,4 @@ dnsbl: false
 # will stop sending requests once they get it.
 # will stop sending requests once they get it.
 status_codes:
 status_codes:
   CHALLENGE: 200
   CHALLENGE: 200
-  DENY: 200 
+  DENY: 200

+ 23 - 1
docs/docs/admin/configuration/expressions.mdx

@@ -143,7 +143,29 @@ Anubis would return a challenge because all of those conditions are true.
 
 
 ## Functions exposed to Anubis expressions
 ## Functions exposed to Anubis expressions
 
 
-There are currently no functions from the Anubis runtime exposed to expressions. This will change in the future.
+Anubis expressions can be augmented with the following functions:
+
+### `randInt`
+
+```ts
+function randInt(n: int): int;
+```
+
+randInt returns a randomly selected integer value in the range of `[0,n)`. This is a thin wrapper around [Go's math/rand/v2#IntN](https://pkg.go.dev/math/rand/v2#IntN), so `n` must be greater than zero. Be careful with this as it may cause inconsistent behavior for genuine users.
+
+This is best applied to explicit block rules, e.g.:
+
+```yaml
+# Denies LightPanda about 75% of the time on average
+- name: deny-lightpanda-sometimes
+  action: DENY
+  expression:
+    all:
+      - userAgent.matches("LightPanda")
+      - randInt(16) >= 4
+```
+
+It seems counter-intuitive to allow known bad clients through sometimes, but this allows you to confuse attackers by making Anubis' behavior random. Adjust the thresholds and numbers as facts and circumstances demand.
 
 
 ## Life advice
 ## Life advice
 
 

+ 8 - 0
lib/policy/config/testdata/good/entropy.yaml

@@ -0,0 +1,8 @@
+bots:
+  - name: total-randomness
+    action: ALLOW
+    expression:
+      all:
+        - '"Accept" in headers'
+        - headers["Accept"].contains("text/html")
+        - randInt(1) == 0

+ 18 - 0
lib/policy/expressions/environment.go

@@ -1,7 +1,11 @@
 package expressions
 package expressions
 
 
 import (
 import (
+	"math/rand/v2"
+
 	"github.com/google/cel-go/cel"
 	"github.com/google/cel-go/cel"
+	"github.com/google/cel-go/common/types"
+	"github.com/google/cel-go/common/types/ref"
 	"github.com/google/cel-go/ext"
 	"github.com/google/cel-go/ext"
 )
 )
 
 
@@ -29,6 +33,20 @@ func NewEnvironment() (*cel.Env, error) {
 		cel.Variable("headers", cel.MapType(cel.StringType, cel.StringType)),
 		cel.Variable("headers", cel.MapType(cel.StringType, cel.StringType)),
 
 
 		// Functions exposed to CEL programs:
 		// Functions exposed to CEL programs:
+		cel.Function("randInt",
+			cel.Overload("randInt_int",
+				[]*cel.Type{cel.IntType},
+				cel.IntType,
+				cel.UnaryBinding(func(val ref.Val) ref.Val {
+					n, ok := val.(types.Int)
+					if !ok {
+						return types.ValOrErr(val, "value is not an integer, but is %T", val)
+					}
+
+					return types.Int(rand.IntN(int(n)))
+				}),
+			),
+		),
 	)
 	)
 }
 }