Use terms and bigrams from the beginning of the string only
parent 619b6c3a93
commit f4fb9f831a

3 changed files with 37 additions and 20 deletions
@@ -8,12 +8,15 @@ from urllib.parse import unquote
 import pandas as pd

 from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tokenizer import tokenize, get_bigrams

 DEFAULT_SCORE = 0

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100
+NUM_FIRST_TOKENS = 3
+NUM_BIGRAMS = 5


 STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,all,almost,alone,along,already,also," \
@@ -35,17 +38,6 @@ STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,
                 "yours,yourself,yourselves,you've,z,Z".split(','))


-def tokenize(input_text):
-    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
-    tokens = cleaned_text.lower().split()
-    if input_text.endswith('…'):
-        # Discard the last two tokens since there will likely be a word cut in two
-        tokens = tokens[:-2]
-    # content_tokens = [token for token in tokens if not token in STOPWORDS]
-    # return content_tokens
-    return tokens
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]
@@ -66,13 +58,23 @@ def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
         print("Processed", i)


+def get_index_tokens(tokens):
+    first_tokens = tokens[:NUM_FIRST_TOKENS]
+    bigrams = get_bigrams(NUM_BIGRAMS, tokens)
+    return set(first_tokens + bigrams)
+
+
 def tokenize_document(url, title_cleaned, extract, score, nlp):
     title_tokens = tokenize(title_cleaned)
     prepared_url = prepare_url_for_tokenizing(unquote(url))
     url_tokens = tokenize(prepared_url)
     extract_tokens = tokenize(extract)
     # print("Extract tokens", extract_tokens)
-    tokens = set(title_tokens) | set(url_tokens) | set(extract_tokens)
+    tokens = get_index_tokens(title_tokens) | get_index_tokens(url_tokens) | get_index_tokens(extract_tokens)
+    # doc = Document(title_cleaned, url, extract, score)
+    # token_scores = {token: score_result([token], doc, True) for token in tokens}
+    # high_scoring_tokens = [k for k, v in token_scores.items() if v > 0.5]
+    # print("High scoring", len(high_scoring_tokens), token_scores, doc)
     document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
     return document
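For illustration, a minimal standalone sketch of the indexing change above: only the first NUM_FIRST_TOKENS tokens and the first NUM_BIGRAMS bigrams of each field (title, URL, extract) now reach the index, so terms from the middle of a long string are no longer indexed. The sample token list is invented; the helper bodies mirror the diff.

NUM_FIRST_TOKENS = 3
NUM_BIGRAMS = 5

def get_bigrams(num_bigrams, tokens):
    # Same behaviour as the helper added in mwmbl/tokenizer.py below
    num_bigrams = min(num_bigrams, len(tokens) - 1)
    return [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]

def get_index_tokens(tokens):
    # Keep only the leading tokens and the leading bigrams
    first_tokens = tokens[:NUM_FIRST_TOKENS]
    bigrams = get_bigrams(NUM_BIGRAMS, tokens)
    return set(first_tokens + bigrams)

tokens = "how to sort a list in python".split()
print(sorted(get_index_tokens(tokens)))
# ['a list', 'how', 'how to', 'list in', 'sort', 'sort a', 'to', 'to sort']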
@@ -5,7 +5,7 @@ from logging import getLogger
 from operator import itemgetter
 from urllib.parse import urlparse

-from mwmbl.indexer.index import tokenize
+from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -35,7 +35,7 @@ def _get_query_regex(terms, is_complete, is_url):
     return pattern


-def _score_result(terms: list[str], result: Document, is_complete: bool):
+def score_result(terms: list[str], result: Document, is_complete: bool):
     features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)

     length_penalty = math.e ** (-LENGTH_PENALTY * len(result.url))
@@ -116,7 +116,7 @@ def order_results(terms: list[str], results: list[Document], is_complete: bool)
     if len(results) == 0:
         return []

-    results_and_scores = [(_score_result(terms, result, is_complete), result) for result in results]
+    results_and_scores = [(score_result(terms, result, is_complete), result) for result in results]
     ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
     filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
     return filtered_results
@@ -181,16 +181,18 @@ class Ranker:
         completions = []
         retrieval_terms = set(terms)

+        bigrams = set(get_bigrams(len(terms), terms))
+
         pages = []
         seen_items = set()
-        for term in retrieval_terms:
+        for term in retrieval_terms | bigrams:
             items = self.tiny_index.retrieve(term)
             if items is not None:
                 for item in items:
-                    if term in item.title.lower() or term in item.extract.lower():
-                        if item.title not in seen_items:
-                            pages.append(item)
-                            seen_items.add(item.title)
+                    # if term in item.title.lower() or term in item.extract.lower():
+                    if item.title not in seen_items:
+                        pages.append(item)
+                        seen_items.add(item.title)

         ordered_results = self.order_results(terms, pages, is_complete)
         return ordered_results, terms, completions
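To make the retrieval change concrete, a hedged sketch of the index keys the updated loop queries, using a hypothetical query. get_bigrams is asked for len(terms) bigrams, which its internal clamp reduces to every adjacent pair, so both single terms and bigrams are looked up.

from mwmbl.tokenizer import tokenize, get_bigrams

terms = tokenize("best pizza in london")  # hypothetical query
retrieval_terms = set(terms)
bigrams = set(get_bigrams(len(terms), terms))

# Each of these keys is retrieved from the index, not just the single terms
print(sorted(retrieval_terms | bigrams))
# ['best', 'best pizza', 'in', 'in london', 'london', 'pizza', 'pizza in']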
mwmbl/tokenizer.py  (new file, 13 lines added)
@@ -0,0 +1,13 @@
+def tokenize(input_text):
+    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
+    tokens = cleaned_text.lower().split()
+    if input_text.endswith('…'):
+        # Discard the last two tokens since there will likely be a word cut in two
+        tokens = tokens[:-2]
+    return tokens
+
+
+def get_bigrams(num_bigrams, tokens):
+    num_bigrams = min(num_bigrams, len(tokens) - 1)
+    bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
+    return bigrams
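A brief usage sketch of the new module, with invented inputs: tokenize lowercases and splits on whitespace, dropping the last two tokens when the text ends in an ellipsis, and get_bigrams clamps the requested count to len(tokens) - 1.

from mwmbl.tokenizer import tokenize, get_bigrams

print(tokenize("A Brief History Of Time"))
# ['a', 'brief', 'history', 'of', 'time']

print(tokenize("the quick brown fox jumps over the la…"))
# ['the', 'quick', 'brown', 'fox', 'jumps', 'over']  (truncated tail dropped)

print(get_bigrams(5, ['hello', 'world', 'again']))
# ['hello world', 'world again']  (clamped to len(tokens) - 1)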