Use terms and bigrams from the beginning of the string only

Daoud Clarke 2022-08-26 17:20:11 +01:00
parent 619b6c3a93
commit f4fb9f831a
3 changed files with 37 additions and 20 deletions


@@ -8,12 +8,15 @@ from urllib.parse import unquote
import pandas as pd
from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
from mwmbl.tokenizer import tokenize, get_bigrams
DEFAULT_SCORE = 0
HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 100
NUM_FIRST_TOKENS = 3
NUM_BIGRAMS = 5
STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,all,almost,alone,along,already,also," \
@@ -35,17 +38,6 @@ STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,
"yours,yourself,yourselves,you've,z,Z".split(','))
def tokenize(input_text):
    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
    tokens = cleaned_text.lower().split()
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
        tokens = tokens[:-2]
    # content_tokens = [token for token in tokens if not token in STOPWORDS]
    # return content_tokens
    return tokens
def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
@@ -66,13 +58,23 @@ def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedD
print("Processed", i)
def get_index_tokens(tokens):
    first_tokens = tokens[:NUM_FIRST_TOKENS]
    bigrams = get_bigrams(NUM_BIGRAMS, tokens)
    return set(first_tokens + bigrams)
def tokenize_document(url, title_cleaned, extract, score, nlp):
    title_tokens = tokenize(title_cleaned)
    prepared_url = prepare_url_for_tokenizing(unquote(url))
    url_tokens = tokenize(prepared_url)
    extract_tokens = tokenize(extract)
    # print("Extract tokens", extract_tokens)
    tokens = set(title_tokens) | set(url_tokens) | set(extract_tokens)
    tokens = get_index_tokens(title_tokens) | get_index_tokens(url_tokens) | get_index_tokens(extract_tokens)
    # doc = Document(title_cleaned, url, extract, score)
    # token_scores = {token: score_result([token], doc, True) for token in tokens}
    # high_scoring_tokens = [k for k, v in token_scores.items() if v > 0.5]
    # print("High scoring", len(high_scoring_tokens), token_scores, doc)
    document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
    return document
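
To illustrate the indexing change above, here is a small standalone sketch (not part of the commit) that reproduces get_index_tokens with the constants from this diff; the sample title is invented:

# Standalone sketch: only the first NUM_FIRST_TOKENS terms and the first
# NUM_BIGRAMS bigrams of each field are kept as index keys.
NUM_FIRST_TOKENS = 3
NUM_BIGRAMS = 5

def get_bigrams(num_bigrams, tokens):
    # Same logic as the new mwmbl/tokenizer.py added in this commit
    num_bigrams = min(num_bigrams, len(tokens) - 1)
    return [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]

def get_index_tokens(tokens):
    first_tokens = tokens[:NUM_FIRST_TOKENS]
    bigrams = get_bigrams(NUM_BIGRAMS, tokens)
    return set(first_tokens + bigrams)

title_tokens = "how to build a tiny search engine in python".split()
print(get_index_tokens(title_tokens))
# {'how', 'to', 'build', 'how to', 'to build', 'build a', 'a tiny', 'tiny search'}
# (set order may vary)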


@@ -5,7 +5,7 @@ from logging import getLogger
from operator import itemgetter
from urllib.parse import urlparse
from mwmbl.indexer.index import tokenize
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -35,7 +35,7 @@ def _get_query_regex(terms, is_complete, is_url):
    return pattern
def _score_result(terms: list[str], result: Document, is_complete: bool):
def score_result(terms: list[str], result: Document, is_complete: bool):
    features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)
    length_penalty = math.e ** (-LENGTH_PENALTY * len(result.url))
@@ -116,7 +116,7 @@ def order_results(terms: list[str], results: list[Document], is_complete: bool)
    if len(results) == 0:
        return []
    results_and_scores = [(_score_result(terms, result, is_complete), result) for result in results]
    results_and_scores = [(score_result(terms, result, is_complete), result) for result in results]
    ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
    filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
    return filtered_results
@@ -181,13 +181,15 @@ class Ranker:
        completions = []
        retrieval_terms = set(terms)
        bigrams = set(get_bigrams(len(terms), terms))
        pages = []
        seen_items = set()
        for term in retrieval_terms:
        for term in retrieval_terms | bigrams:
            items = self.tiny_index.retrieve(term)
            if items is not None:
                for item in items:
                    if term in item.title.lower() or term in item.extract.lower():
                    # if term in item.title.lower() or term in item.extract.lower():
                    if item.title not in seen_items:
                        pages.append(item)
                        seen_items.add(item.title)
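
The retrieval change above can be sketched in isolation (not part of the commit): the query terms and all of their bigrams become index lookup keys, and the per-term title/extract filter gives way to title-based deduplication. The query below is invented and the index lookup itself is not performed here:

def get_bigrams(num_bigrams, tokens):
    # Same logic as the new mwmbl/tokenizer.py
    num_bigrams = min(num_bigrams, len(tokens) - 1)
    return [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]

terms = ["tiny", "search", "engine"]
retrieval_terms = set(terms)
bigrams = set(get_bigrams(len(terms), terms))
print(sorted(retrieval_terms | bigrams))
# ['engine', 'search', 'search engine', 'tiny', 'tiny search']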

mwmbl/tokenizer.py Normal file

@@ -0,0 +1,13 @@
def tokenize(input_text):
    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
    tokens = cleaned_text.lower().split()
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
        tokens = tokens[:-2]
    return tokens


def get_bigrams(num_bigrams, tokens):
    num_bigrams = min(num_bigrams, len(tokens) - 1)
    bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
    return bigrams
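
A quick usage sketch for the new module (assuming it is importable as mwmbl.tokenizer; the sample text is invented):

from mwmbl.tokenizer import tokenize, get_bigrams

tokens = tokenize("An open source search engine")
print(tokens)                  # ['an', 'open', 'source', 'search', 'engine']
print(get_bigrams(5, tokens))  # ['an open', 'open source', 'source search', 'search engine']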