Don't index partial words

This commit is contained in:
Daoud Clarke 2022-01-30 14:30:02 +00:00
parent 2ef8304919
commit 01a21337a9
2 changed files with 5 additions and 5 deletions

View file

@@ -12,8 +12,6 @@ import pandas as pd
# PAGE_SIZE = 512
from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
NUM_INITIAL_TOKENS = 50
HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 100
@@ -27,8 +25,10 @@ def is_content_token(nlp, token):
def tokenize(nlp, input_text):
    """Tokenize *input_text* and return the set of lower-cased content tokens.

    The text is first re-encoded with 'replace' error handling to strip any
    characters that cannot round-trip through UTF-8, then tokenized with the
    supplied ``nlp`` tokenizer. Tokens failing ``is_content_token`` are dropped.

    :param nlp: a spaCy-like language object providing ``tokenizer`` and ``vocab``
    :param input_text: the raw text to tokenize
    :return: set of lower-cased token strings
    """
    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
    tokens = nlp.tokenizer(cleaned_text)
    # Fix: the original condition was `input_text.endswith('')`, which is
    # always True (every string ends with the empty string), so the
    # truncation below fired for EVERY input. It should only fire when the
    # text was visibly cut short — presumably marked with a trailing
    # ellipsis by the upstream extractor (TODO confirm the marker character).
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
        tokens = tokens[:-2]
    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered

View file

@@ -31,9 +31,9 @@ def create(tiny_index: TinyIndex):
def search(s: str):
results, terms = get_results(s)
pattern = get_query_regex(terms)
formatted_results = []
for result in results:
pattern = get_query_regex(terms)
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
matches = re.finditer(pattern, content, re.IGNORECASE)