|
@@ -13,6 +13,7 @@ import (
|
|
|
"github.com/blevesearch/bleve/v2/analysis/lang/en"
|
|
|
"github.com/blevesearch/bleve/v2/mapping"
|
|
|
"github.com/blevesearch/bleve/v2/search/query"
|
|
|
+ "github.com/gabriel-vasile/mimetype"
|
|
|
"github.com/uozi-tech/cosy/logger"
|
|
|
)
|
|
|
|
|
@@ -40,6 +41,12 @@ type SearchIndexer struct {
|
|
|
ctx context.Context
|
|
|
cancel context.CancelFunc
|
|
|
cleanupOnce sync.Once
|
|
|
+
|
|
|
+ // Indexed-content accounting: approximates memory use by the summed size of indexed document content
|
|
|
+ totalContentSize int64
|
|
|
+ documentCount int64
|
|
|
+ maxMemoryUsage int64
|
|
|
+ memoryMutex sync.RWMutex
|
|
|
}
|
|
|
|
|
|
var (
|
|
@@ -57,7 +64,8 @@ func GetSearchIndexer() *SearchIndexer {
|
|
|
}
|
|
|
|
|
|
searchIndexer = &SearchIndexer{
|
|
|
- indexPath: tempDir,
|
|
|
+ indexPath: tempDir,
|
|
|
+ maxMemoryUsage: 100 * 1024 * 1024, // 100MB memory limit for indexed content
|
|
|
}
|
|
|
})
|
|
|
return searchIndexer
|
|
@@ -131,6 +139,12 @@ func (si *SearchIndexer) cleanup() {
|
|
|
si.index = nil
|
|
|
}
|
|
|
|
|
|
+ // Reset memory tracking
|
|
|
+ si.memoryMutex.Lock()
|
|
|
+ si.totalContentSize = 0
|
|
|
+ si.documentCount = 0
|
|
|
+ si.memoryMutex.Unlock()
|
|
|
+
|
|
|
// Remove the temporary directory
|
|
|
if err := os.RemoveAll(si.indexPath); err != nil {
|
|
|
logger.Error("Failed to remove search index directory:", err)
|
|
@@ -191,10 +205,10 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
|
|
|
}
|
|
|
}()
|
|
|
|
|
|
- // File size limit: 10MB to prevent memory overflow
|
|
|
- const maxFileSize = 10 * 1024 * 1024 // 10MB
|
|
|
+ // File size limit: 1MB to prevent memory overflow and improve performance
|
|
|
+ const maxFileSize = 1024 * 1024 // 1MB
|
|
|
if len(content) > maxFileSize {
|
|
|
- logger.Warn("Skipping file due to size limit", "path", configPath, "size", len(content), "limit", maxFileSize)
|
|
|
+ logger.Debugf("Skipping file due to size limit, path: %s, size: %d, limit: %d", configPath, len(content), maxFileSize)
|
|
|
return nil
|
|
|
}
|
|
|
|
|
@@ -203,9 +217,9 @@ func (si *SearchIndexer) handleConfigScan(configPath string, content []byte) (er
|
|
|
return nil
|
|
|
}
|
|
|
|
|
|
- // Basic content validation: check if it's text content
|
|
|
- if !isTextContent(content) {
|
|
|
- logger.Warn("Skipping non-text file", "path", configPath)
|
|
|
+ // Basic content validation: only non-empty content detected as text/plain is indexed
|
|
|
+ if !isConfigFile(content) {
|
|
|
+ logger.Debugf("Skipping non-config file: %s", configPath)
|
|
|
return nil
|
|
|
}
|
|
|
|
|
@@ -249,6 +263,18 @@ func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
|
|
|
}
|
|
|
}()
|
|
|
|
|
|
+ // Additional size check as a safety measure
|
|
|
+ if len(doc.Content) > 2*1024*1024 { // 2MB absolute limit
|
|
|
+ return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
|
|
|
+ }
|
|
|
+
|
|
|
+ // Check memory usage before indexing
|
|
|
+ contentSize := int64(len(doc.Content))
|
|
|
+ if !si.checkMemoryLimitBeforeIndexing(contentSize) {
|
|
|
+ logger.Warn("Skipping document due to memory limit", "document_id", doc.ID, "content_size", contentSize)
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+
|
|
|
si.indexMutex.RLock()
|
|
|
defer si.indexMutex.RUnlock()
|
|
|
|
|
@@ -256,15 +282,19 @@ func (si *SearchIndexer) IndexDocument(doc SearchDocument) (err error) {
|
|
|
return fmt.Errorf("search index not initialized")
|
|
|
}
|
|
|
|
|
|
- // Additional size check as a safety measure
|
|
|
- if len(doc.Content) > 50*1024*1024 { // 50MB absolute limit
|
|
|
- return fmt.Errorf("document content too large: %d bytes", len(doc.Content))
|
|
|
+ // Index the document
|
|
|
+ err = si.index.Index(doc.ID, doc)
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
}
|
|
|
|
|
|
+ // Update memory usage tracking
|
|
|
+ si.updateMemoryUsage(doc.ID, contentSize, true)
|
|
|
+
|
|
|
// logger.Debugf("Indexing document: ID=%s, Type=%s, Name=%s, Path=%s",
|
|
|
// doc.ID, doc.Type, doc.Name, doc.Path)
|
|
|
|
|
|
- return si.index.Index(doc.ID, doc)
|
|
|
+ return nil
|
|
|
}
|
|
|
|
|
|
// Search performs a search query
|
|
@@ -324,7 +354,7 @@ func (si *SearchIndexer) searchWithType(ctx context.Context, queryStr string, do
|
|
|
}
|
|
|
results := si.convertResults(res.result)
|
|
|
|
|
|
- // Debug log the search execution
|
|
|
+ // Log the search execution
|
|
|
logger.Debugf("Search index query '%s' (type: %s, limit: %d) returned %d results",
|
|
|
queryStr, docType, limit, len(results))
|
|
|
|
|
@@ -436,6 +466,10 @@ func (si *SearchIndexer) DeleteDocument(docID string) error {
|
|
|
return fmt.Errorf("search index not initialized")
|
|
|
}
|
|
|
|
|
|
+ // Note: We don't track the exact size of deleted documents here
|
|
|
+ // as it would require storing document sizes separately.
|
|
|
+ // Until cleanup() or a restart resets the counters, deleted documents keep counting toward the size and document limits.
|
|
|
+
|
|
|
return si.index.Delete(docID)
|
|
|
}
|
|
|
|
|
@@ -499,9 +533,16 @@ func (si *SearchIndexer) GetIndexStats() (map[string]interface{}, error) {
|
|
|
return nil, err
|
|
|
}
|
|
|
|
|
|
+ // Get memory usage statistics
|
|
|
+ totalContentSize, trackedDocCount, maxMemoryUsage := si.getMemoryUsage()
|
|
|
+
|
|
|
return map[string]interface{}{
|
|
|
- "document_count": docCount,
|
|
|
- "index_path": si.indexPath,
|
|
|
+ "document_count": docCount,
|
|
|
+ "tracked_document_count": trackedDocCount,
|
|
|
+ "total_content_size": totalContentSize,
|
|
|
+ "max_memory_usage": maxMemoryUsage,
|
|
|
+ "memory_usage_percent": float64(totalContentSize) / float64(maxMemoryUsage) * 100,
|
|
|
+ "index_path": si.indexPath,
|
|
|
}, nil
|
|
|
}
|
|
|
|
|
@@ -537,49 +578,72 @@ func SearchAll(ctx context.Context, query string, limit int) ([]SearchResult, er
|
|
|
return GetSearchIndexer().Search(ctx, query, limit)
|
|
|
}
|
|
|
|
|
|
-// isTextContent checks if the content appears to be text-based
|
|
|
-// This helps prevent indexing binary files that might have been misidentified
|
|
|
-func isTextContent(content []byte) bool {
|
|
|
- if len(content) == 0 {
|
|
|
- return true // Empty content is considered text
|
|
|
- }
|
|
|
-
|
|
|
- // Check for common binary file signatures
|
|
|
- if len(content) >= 4 {
|
|
|
- // Check for some common binary file headers
|
|
|
- switch {
|
|
|
- case content[0] == 0x7F && content[1] == 0x45 && content[2] == 0x4C && content[3] == 0x46: // ELF
|
|
|
- return false
|
|
|
- case content[0] == 0x89 && content[1] == 0x50 && content[2] == 0x4E && content[3] == 0x47: // PNG
|
|
|
- return false
|
|
|
- case content[0] == 0xFF && content[1] == 0xD8 && content[2] == 0xFF: // JPEG
|
|
|
- return false
|
|
|
- case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x03 && content[3] == 0x04: // ZIP
|
|
|
- return false
|
|
|
- case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x05 && content[3] == 0x06: // ZIP (empty)
|
|
|
- return false
|
|
|
- case content[0] == 0x50 && content[1] == 0x4B && content[2] == 0x07 && content[3] == 0x08: // ZIP (spanned)
|
|
|
- return false
|
|
|
- }
|
|
|
+// checkMemoryLimitBeforeIndexing reports whether contentSize more bytes fit within maxMemoryUsage and the 1000-document cap has not been reached
|
|
|
+func (si *SearchIndexer) checkMemoryLimitBeforeIndexing(contentSize int64) bool {
|
|
|
+ si.memoryMutex.RLock()
|
|
|
+ defer si.memoryMutex.RUnlock()
|
|
|
+
|
|
|
+ // Check if adding this content would exceed the memory limit
|
|
|
+ newTotalSize := si.totalContentSize + contentSize
|
|
|
+ if newTotalSize > si.maxMemoryUsage {
|
|
|
+ logger.Debugf("Memory limit would be exceeded: current=%d, new=%d, limit=%d",
|
|
|
+ si.totalContentSize, newTotalSize, si.maxMemoryUsage)
|
|
|
+ return false
|
|
|
}
|
|
|
|
|
|
- // Check if the first part of the content contains mostly printable characters
|
|
|
- // Sample up to 8KB for performance
|
|
|
- sampleSize := len(content)
|
|
|
- if sampleSize > 8192 {
|
|
|
- sampleSize = 8192
|
|
|
+ // Also check document count limit (max 1000 documents)
|
|
|
+ if si.documentCount >= 1000 {
|
|
|
+ logger.Debugf("Document count limit reached: %d", si.documentCount)
|
|
|
+ return false
|
|
|
}
|
|
|
|
|
|
- nonPrintableCount := 0
|
|
|
- for i := 0; i < sampleSize; i++ {
|
|
|
- b := content[i]
|
|
|
- // Allow printable ASCII characters, newlines, tabs, and carriage returns
|
|
|
- if (b < 32 && b != 9 && b != 10 && b != 13) || b > 126 {
|
|
|
- nonPrintableCount++
|
|
|
+ return true
|
|
|
+}
|
|
|
+
|
|
|
+// updateMemoryUsage adds (isAddition) or subtracts contentSize and one document from the tracked totals, clamping both at zero on removal
|
|
|
+func (si *SearchIndexer) updateMemoryUsage(documentID string, contentSize int64, isAddition bool) {
|
|
|
+ si.memoryMutex.Lock()
|
|
|
+ defer si.memoryMutex.Unlock()
|
|
|
+
|
|
|
+ if isAddition {
|
|
|
+ si.totalContentSize += contentSize
|
|
|
+ si.documentCount++
|
|
|
+ logger.Debugf("Added document %s: size=%d, total_size=%d, count=%d",
|
|
|
+ documentID, contentSize, si.totalContentSize, si.documentCount)
|
|
|
+ } else {
|
|
|
+ si.totalContentSize -= contentSize
|
|
|
+ si.documentCount--
|
|
|
+ if si.totalContentSize < 0 {
|
|
|
+ si.totalContentSize = 0
|
|
|
}
|
|
|
+ if si.documentCount < 0 {
|
|
|
+ si.documentCount = 0
|
|
|
+ }
|
|
|
+ logger.Debugf("Removed document %s: size=%d, total_size=%d, count=%d",
|
|
|
+ documentID, contentSize, si.totalContentSize, si.documentCount)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// getMemoryUsage returns the tracked total content size, the tracked document count, and the configured size limit, in that order
|
|
|
+func (si *SearchIndexer) getMemoryUsage() (int64, int64, int64) {
|
|
|
+ si.memoryMutex.RLock()
|
|
|
+ defer si.memoryMutex.RUnlock()
|
|
|
+ return si.totalContentSize, si.documentCount, si.maxMemoryUsage
|
|
|
+}
|
|
|
+
|
|
|
+// isConfigFile reports whether content is non-empty and detected as text/plain (the MIME type of most nginx configs)
|
|
|
+func isConfigFile(content []byte) bool {
|
|
|
+ if len(content) == 0 {
|
|
|
+ return false // Empty files are not useful for configuration
|
|
|
+ }
|
|
|
+
|
|
|
+ // Detect MIME type and only accept text/plain
|
|
|
+ mtype := mimetype.Detect(content)
|
|
|
+
|
|
|
+ if mtype.Is("text/plain") {
|
|
|
+ return true
|
|
|
}
|
|
|
|
|
|
- // If more than 30% of the sampled content is non-printable, consider it binary
|
|
|
- threshold := float64(sampleSize) * 0.3
|
|
|
- return float64(nonPrintableCount) <= threshold
|
|
|
+ logger.Debugf("Skipping non-text/plain file with MIME type: %s", mtype.String())
|
|
|
+ return false
|
|
|
}
|