Просмотр исходного кода

enhance: content validation in config indexing #1240

0xJacky 1 неделю назад
Родитель
Коммит
b8e017e715
1 изменённый файл: 88 добавлений и 2 удаления
  1. 88 2
      internal/cache/search.go

+ 88 - 2
internal/cache/search.go

@@ -182,7 +182,33 @@ func (si *SearchIndexer) createIndexMapping() mapping.IndexMapping {
 }
 
 // handleConfigScan processes scanned config files and indexes them
-func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) error {
+func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (err error) {
+	// Add panic recovery to prevent the entire application from crashing
+	defer func() {
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic during config scan: %v", r)
+			logger.Error("Panic occurred while scanning config", "config_path", configPath, "content_size", len(content), "error", err)
+		}
+	}()
+
+	// File size limit: 10MB to prevent memory overflow
+	const maxFileSize = 10 * 1024 * 1024 // 10MB
+	if len(content) > maxFileSize {
+		logger.Warn("Skipping file due to size limit", "path", configPath, "size", len(content), "limit", maxFileSize)
+		return nil
+	}
+
+	// Skip empty files
+	if len(content) == 0 {
+		return nil
+	}
+
+	// Basic content validation: check if it's text content
+	if !isTextContent(content) {
+		logger.Warn("Skipping non-text file", "path", configPath)
+		return nil
+	}
+
 	docType := si.determineConfigType(configPath)
 	if docType == "" {
 		return nil // Skip unsupported file types
@@ -214,7 +240,15 @@ func (si *SearchIndexer) determineConfigType(configPath string) string {
 }
 
 // IndexDocument indexes a single document
-func (si *SearchIndexer) IndexDocument(doc SearchDocument) error {
+func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
+	// Add panic recovery to prevent the entire application from crashing
+	defer func() {
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic during indexing: %v", r)
+			logger.Error("Panic occurred while indexing document", "document_id", doc.ID, "error", err)
+		}
+	}()
+
 	si.indexMutex.RLock()
 	defer si.indexMutex.RUnlock()
 
@@ -222,6 +256,11 @@ func (si *SearchIndexer) IndexDocument(doc SearchDocument) error {
 		return fmt.Errorf("search index not initialized")
 	}
 
+	// Additional size check as a safety measure
+	if len(doc.Content) > 50*1024*1024 { // 50MB absolute limit
+		return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
+	}
+
 	// logger.Debugf("Indexing document: ID=%s, Type=%s, Name=%s, Path=%s",
 	// 	doc.ID, doc.Type, doc.Name, doc.Path)
 
@@ -497,3 +536,50 @@ func SearchConfigs(ctx context.Context, query string, limit int) ([]SearchResult
 func SearchAll(ctx context.Context, query string, limit int) ([]SearchResult, error) {
 	return GetSearchIndexer().Search(ctx, query, limit)
 }
+
// isTextContent reports whether content looks like text rather than binary
// data. It is a heuristic used to avoid indexing binary files that were
// misidentified as configuration files.
//
// The decision is made in two steps:
//  1. Content starting with a well-known binary signature (ELF, PNG, JPEG,
//     ZIP) is rejected immediately.
//  2. Up to the first 8 KiB are decoded as UTF-8. Control characters (other
//     than tab, newline and carriage return) and bytes that do not form valid
//     UTF-8 are counted as suspicious; the content is treated as binary when
//     more than 30% of the decoded characters are suspicious.
//
// Valid multi-byte UTF-8 sequences (for example comments written in Chinese
// or Cyrillic) count as ordinary text. Empty content is considered text.
func isTextContent(content []byte) bool {
	if len(content) == 0 {
		return true // Empty content is considered text
	}

	// Magic numbers of common binary formats.
	binarySignatures := [...]string{
		"\x7fELF",      // ELF executable
		"\x89PNG",      // PNG image
		"\xff\xd8\xff", // JPEG image
		"PK\x03\x04",   // ZIP archive
		"PK\x05\x06",   // ZIP archive (empty)
		"PK\x07\x08",   // ZIP archive (spanned)
	}
	for _, sig := range binarySignatures {
		if len(content) >= len(sig) && string(content[:len(sig)]) == sig {
			return false
		}
	}

	// Only inspect the beginning of large inputs to keep the check cheap.
	const maxSampleSize = 8192
	sample := content
	if len(sample) > maxSampleSize {
		sample = sample[:maxSampleSize]
	}

	// Ranging over a string decodes UTF-8; every byte that is not part of a
	// valid sequence is reported individually as U+FFFD. A multi-byte
	// character cut in half by the sample boundary therefore adds at most
	// three suspicious characters, which is negligible against the threshold.
	total, suspicious := 0, 0
	for _, r := range string(sample) {
		total++
		switch {
		case r == '\t', r == '\n', r == '\r':
			// Ordinary whitespace found in text files.
		case r < 0x20, r >= 0x7f && r <= 0x9f, r == '\uFFFD':
			// C0/C1 control characters, DEL, or an undecodable byte.
			suspicious++
		}
	}

	// Binary if more than 30% of the sampled characters are suspicious.
	// Integer arithmetic avoids floating-point rounding at the boundary.
	return suspicious*10 <= total*3
}