Przeglądaj źródła

perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)

* perf(internal): Use FastHash for internal hashing
docs: Add xxhash performance improvement to changelog entry
feat(hash): Add fast non-cryptographic hash function

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* test(hash): add xxhash benchmarks and collision tests

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

* Update metadata

check-spelling run (pull_request) for json/hash

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

---------

Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Jason Cameron 6 dni temu
rodzic
commit
e2b46fc5e7

+ 2 - 2
.github/actions/spelling/expect.txt

@@ -36,6 +36,7 @@ celchecker
 CELPHASE
 cerr
 certresolver
+cespare
 CGNAT
 cgr
 chainguard
@@ -186,7 +187,6 @@ OCOB
 ogtags
 omgili
 omgilibot
-onionservice
 openai
 openrc
 pag
@@ -214,6 +214,7 @@ qualys
 qwant
 qwantbot
 rac
+rawler
 rcvar
 redir
 redirectscheme
@@ -264,7 +265,6 @@ thoth
 thothmock
 Tik
 Timpibot
-torproject
 traefik
 uberspace
 unixhttpd

+ 1 - 0
docs/docs/CHANGELOG.md

@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `--strip-base-prefix` flag/envvar to strip the base prefix from request paths when forwarding to target servers
 - Add `robots2policy` CLI utility to convert robots.txt files to Anubis challenge policies using CEL expressions ([#409](https://github.com/TecharoHQ/anubis/issues/409))
 - Implement GeoIP and ASN based checks via [Thoth](https://anubis.techaro.lol/docs/admin/thoth) ([#206](https://github.com/TecharoHQ/anubis/issues/206))
+- Replace internal SHA256 hashing with xxhash for 4-6x performance improvement in policy evaluation and cache operations
 
 ## v1.19.1: Jenomis cen Lexentale - Echo 1
 

+ 3 - 3
go.mod

@@ -5,19 +5,21 @@ go 1.24.2
 require (
 	github.com/TecharoHQ/thoth-proto v0.4.0
 	github.com/a-h/templ v0.3.898
+	github.com/cespare/xxhash/v2 v2.3.0
 	github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456
 	github.com/gaissmai/bart v0.20.4
 	github.com/golang-jwt/jwt/v5 v5.2.2
 	github.com/google/cel-go v0.25.0
 	github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1
+	github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0
 	github.com/joho/godotenv v1.5.1
 	github.com/playwright-community/playwright-go v0.5200.0
 	github.com/prometheus/client_golang v1.22.0
 	github.com/sebest/xff v0.0.0-20210106013422-671bd2870b3a
 	github.com/yl2chen/cidranger v1.0.2
 	golang.org/x/net v0.41.0
-	gopkg.in/yaml.v3 v3.0.1
 	google.golang.org/grpc v1.72.2
+	gopkg.in/yaml.v3 v3.0.1
 	k8s.io/apimachinery v0.33.1
 	sigs.k8s.io/yaml v1.4.0
 )
@@ -43,7 +45,6 @@ require (
 	github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb // indirect
 	github.com/cavaliergopher/cpio v1.0.1 // indirect
 	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cli/browser v1.3.0 // indirect
 	github.com/cli/go-gh v0.1.0 // indirect
 	github.com/cloudflare/circl v1.6.1 // indirect
@@ -72,7 +73,6 @@ require (
 	github.com/goreleaser/chglog v0.7.0 // indirect
 	github.com/goreleaser/fileglob v1.3.0 // indirect
 	github.com/goreleaser/nfpm/v2 v2.42.1 // indirect
-	github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect
 	github.com/huandu/xstrings v1.5.0 // indirect
 	github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
 	github.com/kevinburke/ssh_config v1.2.0 // indirect

+ 13 - 0
internal/hash.go

@@ -3,10 +3,23 @@ package internal
 import (
 	"crypto/sha256"
 	"encoding/hex"
+	"strconv"
+
+	"github.com/cespare/xxhash/v2"
 )
 
+// SHA256sum returns the lowercase hex-encoded SHA-256 digest of text. The
+// original author notes it is still used for proof-of-work challenges, where the
+// security properties of a cryptographic hash function are needed.
+// NOTE(review): challengeFor in lib/anubis.go now calls FastHash; confirm that
+// call site does not need a cryptographic hash.
 func SHA256sum(text string) string {
 	hash := sha256.New()
 	hash.Write([]byte(text))
 	return hex.EncodeToString(hash.Sum(nil))
 }
+
+// FastHash is a high-performance non-cryptographic hash function suitable for
+// internal caching, policy rule identification, and other performance-critical
+// use cases where cryptographic security is not required.
+func FastHash(text string) string {
+	h := xxhash.Sum64String(text)
+	return strconv.FormatUint(h, 16)
+}

+ 261 - 0
internal/hash_bench_test.go

@@ -0,0 +1,261 @@
+package internal
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+// XXHash64sum forwards to FastHash. It gives the benchmarks and tests in this
+// file an xxhash-named counterpart to SHA256sum.
+func XXHash64sum(text string) string {
+	digest := FastHash(text)
+	return digest
+}
+
+// Shared fixtures for the hash benchmarks and collision tests in this file.
+// Per the original author they mirror real usage patterns in the codebase.
+var (
+	// policyInputs holds sample header lines, path patterns, and CIDR ranges
+	// of the kind fed to policy checkers.
+	policyInputs = []string{
+		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+		"User-Agent: bot/1.0",
+		"User-Agent: GoogleBot/2.1",
+		"/robots.txt",
+		"/api/.*",
+		"10.0.0.0/8",
+		"192.168.1.0/24",
+		"172.16.0.0/12",
+	}
+
+	// challengeInputs holds comma-separated key=value strings modeled on the
+	// challenge data built by the challengeFor function.
+	challengeInputs = []string{
+		"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
+		"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
+		"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
+	}
+
+	// botRuleInputs holds bot rule patterns in "name::rules" form.
+	botRuleInputs = []string{
+		"GoogleBot::path:/robots.txt",
+		"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
+		"FacebookBot::headers:Accept-Language,User-Agent",
+		"TwitterBot::cidr:192.168.1.0/24",
+	}
+
+	// celInputs holds sample CEL expression sources as used in policy rules.
+	celInputs = []string{
+		`request.headers["User-Agent"].contains("bot")`,
+		`request.path.startsWith("/api/") && request.method == "POST"`,
+		`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
+		`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
+	}
+
+	// asnInputs holds newline-separated ASN lists modeled on the Thoth ASN
+	// checker's hash input.
+	asnInputs = []string{
+		"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
+		"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
+		"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
+	}
+)
+
+// BenchmarkSHA256_PolicyInputs measures SHA256sum over policyInputs, hashing
+// one entry per iteration and cycling through the slice.
+func BenchmarkSHA256_PolicyInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = SHA256sum(policyInputs[i%len(policyInputs)])
+		i++
+	}
+}
+
+// BenchmarkXXHash_PolicyInputs measures XXHash64sum over policyInputs, hashing
+// one entry per iteration and cycling through the slice.
+func BenchmarkXXHash_PolicyInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = XXHash64sum(policyInputs[i%len(policyInputs)])
+		i++
+	}
+}
+
+// BenchmarkSHA256_ChallengeInputs measures SHA256sum over challengeInputs,
+// hashing one entry per iteration and cycling through the slice.
+func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = SHA256sum(challengeInputs[i%len(challengeInputs)])
+		i++
+	}
+}
+
+// BenchmarkXXHash_ChallengeInputs measures XXHash64sum over challengeInputs,
+// hashing one entry per iteration and cycling through the slice.
+func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = XXHash64sum(challengeInputs[i%len(challengeInputs)])
+		i++
+	}
+}
+
+// BenchmarkSHA256_BotRuleInputs measures SHA256sum over botRuleInputs, hashing
+// one entry per iteration and cycling through the slice.
+func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = SHA256sum(botRuleInputs[i%len(botRuleInputs)])
+		i++
+	}
+}
+
+// BenchmarkXXHash_BotRuleInputs measures XXHash64sum over botRuleInputs,
+// hashing one entry per iteration and cycling through the slice.
+func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = XXHash64sum(botRuleInputs[i%len(botRuleInputs)])
+		i++
+	}
+}
+
+// BenchmarkSHA256_CELInputs measures SHA256sum over celInputs, hashing one
+// entry per iteration and cycling through the slice.
+func BenchmarkSHA256_CELInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = SHA256sum(celInputs[i%len(celInputs)])
+		i++
+	}
+}
+
+// BenchmarkXXHash_CELInputs measures XXHash64sum over celInputs, hashing one
+// entry per iteration and cycling through the slice.
+func BenchmarkXXHash_CELInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = XXHash64sum(celInputs[i%len(celInputs)])
+		i++
+	}
+}
+
+// BenchmarkSHA256_ASNInputs measures SHA256sum over asnInputs, hashing one
+// entry per iteration and cycling through the slice.
+func BenchmarkSHA256_ASNInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = SHA256sum(asnInputs[i%len(asnInputs)])
+		i++
+	}
+}
+
+// BenchmarkXXHash_ASNInputs measures XXHash64sum over asnInputs, hashing one
+// entry per iteration and cycling through the slice.
+func BenchmarkXXHash_ASNInputs(b *testing.B) {
+	i := 0
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		_ = XXHash64sum(asnInputs[i%len(asnInputs)])
+		i++
+	}
+}
+
+// BenchmarkSHA256_PolicyList measures the policy list hashing used in
+// checker.go with SHA256sum: hash every entry, write the digests one per line,
+// then hash the combined text.
+func BenchmarkSHA256_PolicyList(b *testing.B) {
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		var sb strings.Builder
+		for _, input := range policyInputs {
+			// Fprintln is kept deliberately so the benchmark mirrors the
+			// production list-hashing code.
+			fmt.Fprintln(&sb, SHA256sum(input))
+		}
+		_ = SHA256sum(sb.String())
+	}
+}
+
+// BenchmarkXXHash_PolicyList measures the policy list hashing used in
+// checker.go with XXHash64sum: hash every entry, write the digests one per
+// line, then hash the combined text.
+func BenchmarkXXHash_PolicyList(b *testing.B) {
+	// b.Loop (Go 1.24+) times only the loop body and stops the compiler from
+	// optimizing away the discarded hash call; no ResetTimer is needed.
+	for b.Loop() {
+		var sb strings.Builder
+		for _, input := range policyInputs {
+			// Fprintln is kept deliberately so the benchmark mirrors the
+			// production list-hashing code.
+			fmt.Fprintln(&sb, XXHash64sum(input))
+		}
+		_ = XXHash64sum(sb.String())
+	}
+}
+
+// TestHashCollisions checks that XXHash64sum yields distinct digests for the
+// realistic fixture inputs and for two generated sets of near-identical
+// strings. Every collision is reported; the test keeps going so that all of
+// them are listed in one run.
+func TestHashCollisions(t *testing.T) {
+	// Copy the fixtures into a fresh slice. Appending directly onto
+	// policyInputs could write into its backing array if it ever had spare
+	// capacity, silently corrupting data shared with the benchmarks.
+	groups := [][]string{policyInputs, challengeInputs, botRuleInputs, celInputs, asnInputs}
+	total := 0
+	for _, group := range groups {
+		total += len(group)
+	}
+	allInputs := make([]string, 0, total)
+	for _, group := range groups {
+		allInputs = append(allInputs, group...)
+	}
+
+	// seen maps each digest to the most recent input that produced it.
+	seen := make(map[string]string)
+
+	// record hashes input and reports a collision when a different input has
+	// already produced the same digest. A repeated input is not a collision.
+	record := func(phase, input string) {
+		t.Helper()
+		digest := XXHash64sum(input)
+		if prev, ok := seen[digest]; ok && prev != input {
+			t.Errorf("XXHash collision %s: %q and %q both hash to %s", phase, input, prev, digest)
+		}
+		seen[digest] = input
+	}
+
+	// Start with realistic inputs from actual usage.
+	for _, input := range allInputs {
+		record("detected", input)
+	}
+	// Only claim "no collisions" when none has been reported so far.
+	if !t.Failed() {
+		t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs))
+	}
+
+	// Test similar strings that might cause hash collisions.
+	prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
+	suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
+	variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
+
+	stressCount := 0
+	for _, prefix := range prefixes {
+		for _, suffix := range suffixes {
+			for _, variation := range variations {
+				for i := 0; i < 100; i++ {
+					record("in stress test", fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i))
+					stressCount++
+				}
+			}
+		}
+	}
+	if !t.Failed() {
+		t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount)
+	}
+
+	// Test sequential patterns that might be problematic.
+	patterns := []string{
+		"192.168.1.%d",
+		"10.0.0.%d",
+		"172.16.%d.1",
+		"challenge-%d",
+		"bot-rule-%d",
+		"policy-%016x",
+		"session-%016x",
+	}
+
+	seqCount := 0
+	for _, pattern := range patterns {
+		for i := 0; i < 10000; i++ {
+			record("in sequential test", fmt.Sprintf(pattern, i))
+			seqCount++
+		}
+	}
+	if !t.Failed() {
+		t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount)
+	}
+
+	totalInputs := len(allInputs) + stressCount + seqCount
+	if !t.Failed() {
+		t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs)
+	}
+}
+
+// TestXXHashFormat verifies that XXHash64sum output is usable as a cache key:
+// non-empty, at most 16 characters, and made up solely of lowercase
+// hexadecimal digits.
+func TestXXHashFormat(t *testing.T) {
+	// isLowerHex reports whether r is one of 0-9 or a-f.
+	isLowerHex := func(r rune) bool {
+		return ('0' <= r && r <= '9') || ('a' <= r && r <= 'f')
+	}
+
+	inputs := []string{
+		"short",
+		"",
+		"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
+		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+	}
+
+	for _, in := range inputs {
+		digest := XXHash64sum(in)
+		size := len(digest)
+
+		// A digest must never be the empty string.
+		if size == 0 {
+			t.Errorf("Empty hash for input %q", in)
+		}
+
+		// A 64-bit value needs at most 16 hex digits.
+		if size > 16 {
+			t.Errorf("Hash too long for input %q: %s (length %d)", in, digest, size)
+		}
+
+		// Report every character that is not a lowercase hex digit.
+		for _, r := range digest {
+			if isLowerHex(r) {
+				continue
+			}
+			t.Errorf("Non-hex character %c in hash %s for input %q", r, digest, in)
+		}
+
+		t.Logf("Input: %q -> Hash: %s", in, digest)
+	}
+}

+ 1 - 1
internal/thoth/asnchecker.go

@@ -26,7 +26,7 @@ func (c *Client) ASNCheckerFor(asns []uint32) checker.Impl {
 	return &ASNChecker{
 		iptoasn: c.IPToASN,
 		asns:    asnMap,
-		hash:    internal.SHA256sum(sb.String()),
+		hash:    internal.FastHash(sb.String()),
 	}
 }
 

+ 1 - 1
lib/anubis.go

@@ -90,7 +90,7 @@ func (s *Server) challengeFor(r *http.Request, difficulty int) string {
 		fp,
 		difficulty,
 	)
-	return internal.SHA256sum(challengeData)
+	return internal.FastHash(challengeData)
 }
 
 func (s *Server) maybeReverseProxyHttpStatusOnly(w http.ResponseWriter, r *http.Request) {

+ 1 - 1
lib/policy/bot.go

@@ -17,5 +17,5 @@ type Bot struct {
 }
 
 func (b Bot) Hash() string {
-	return internal.SHA256sum(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash()))
+	return internal.FastHash(fmt.Sprintf("%s::%s", b.Name, b.Rules.Hash()))
 }

+ 1 - 1
lib/policy/celchecker.go

@@ -63,7 +63,7 @@ func NewCELChecker(cfg *config.ExpressionOrList) (*CELChecker, error) {
 }
 
 func (cc *CELChecker) Hash() string {
-	return internal.SHA256sum(cc.src)
+	return internal.FastHash(cc.src)
 }
 
 func (cc *CELChecker) Check(r *http.Request) (bool, error) {

+ 6 - 6
lib/policy/checker.go

@@ -28,7 +28,7 @@ func (staticHashChecker) Check(r *http.Request) (bool, error) {
 func (s staticHashChecker) Hash() string { return s.hash }
 
 func NewStaticHashChecker(hashable string) checker.Impl {
-	return staticHashChecker{hash: internal.SHA256sum(hashable)}
+	return staticHashChecker{hash: internal.FastHash(hashable)}
 }
 
 type RemoteAddrChecker struct {
@@ -55,7 +55,7 @@ func NewRemoteAddrChecker(cidrs []string) (checker.Impl, error) {
 
 	return &RemoteAddrChecker{
 		ranger: ranger,
-		hash:   internal.SHA256sum(sb.String()),
+		hash:   internal.FastHash(sb.String()),
 	}, nil
 }
 
@@ -101,7 +101,7 @@ func NewHeaderMatchesChecker(header, rexStr string) (checker.Impl, error) {
 	if err != nil {
 		return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
 	}
-	return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.SHA256sum(header + ": " + rexStr)}, nil
+	return &HeaderMatchesChecker{strings.TrimSpace(header), rex, internal.FastHash(header + ": " + rexStr)}, nil
 }
 
 func (hmc *HeaderMatchesChecker) Check(r *http.Request) (bool, error) {
@@ -126,7 +126,7 @@ func NewPathChecker(rexStr string) (checker.Impl, error) {
 	if err != nil {
 		return nil, fmt.Errorf("%w: regex %s failed parse: %w", ErrMisconfiguration, rexStr, err)
 	}
-	return &PathChecker{rex, internal.SHA256sum(rexStr)}, nil
+	return &PathChecker{rex, internal.FastHash(rexStr)}, nil
 }
 
 func (pc *PathChecker) Check(r *http.Request) (bool, error) {
@@ -158,7 +158,7 @@ func (hec headerExistsChecker) Check(r *http.Request) (bool, error) {
 }
 
 func (hec headerExistsChecker) Hash() string {
-	return internal.SHA256sum(hec.header)
+	return internal.FastHash(hec.header)
 }
 
 func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
@@ -177,7 +177,7 @@ func NewHeadersChecker(headermap map[string]string) (checker.Impl, error) {
 			continue
 		}
 
-		result = append(result, &HeaderMatchesChecker{key, rex, internal.SHA256sum(key + ": " + rexStr)})
+		result = append(result, &HeaderMatchesChecker{key, rex, internal.FastHash(key + ": " + rexStr)})
 	}
 
 	if len(errs) != 0 {

+ 1 - 1
lib/policy/checker/checker.go

@@ -37,5 +37,5 @@ func (l List) Hash() string {
 		fmt.Fprintln(&sb, c.Hash())
 	}
 
-	return internal.SHA256sum(sb.String())
+	return internal.FastHash(sb.String())
 }