Explorar o código

Don't index partial words

Daoud Clarke hai 3 anos
pai
achega
01a21337a9
Modificáronse 2 ficheiros con 5 adicións e 5 borrados
  1. 4 4
      mwmbl/indexer/index.py
  2. 1 1
      mwmbl/tinysearchengine/create_app.py

+ 4 - 4
mwmbl/indexer/index.py

@@ -12,8 +12,6 @@ import pandas as pd
 # PAGE_SIZE = 512
 # PAGE_SIZE = 512
 from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 
-NUM_INITIAL_TOKENS = 50
-
 HTTP_START = 'http://'
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100
 BATCH_SIZE = 100
@@ -27,8 +25,10 @@ def is_content_token(nlp, token):
 def tokenize(nlp, input_text):
 def tokenize(nlp, input_text):
     cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     tokens = nlp.tokenizer(cleaned_text)
     tokens = nlp.tokenizer(cleaned_text)
-    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
-                      if is_content_token(nlp, token)]
+    if input_text.endswith('…'):
+        # Discard the last two tokens since there will likely be a word cut in two
+        tokens = tokens[:-2]
+    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     return lowered
     return lowered
 
 

+ 1 - 1
mwmbl/tinysearchengine/create_app.py

@@ -31,9 +31,9 @@ def create(tiny_index: TinyIndex):
     def search(s: str):
     def search(s: str):
         results, terms = get_results(s)
         results, terms = get_results(s)
 
 
+        pattern = get_query_regex(terms)
         formatted_results = []
         formatted_results = []
         for result in results:
         for result in results:
-            pattern = get_query_regex(terms)
             formatted_result = {}
             formatted_result = {}
             for content_type, content in [('title', result.title), ('extract', result.extract)]:
             for content_type, content in [('title', result.title), ('extract', result.extract)]:
                 matches = re.finditer(pattern, content, re.IGNORECASE)
                 matches = re.finditer(pattern, content, re.IGNORECASE)