diff --git a/Dockerfile b/Dockerfile
index 533ef6d..4ac39f6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9-slim-bullseye as base
+FROM python:3.10.2-bullseye as base
 
 ENV PYTHONFAULTHANDLER=1 \
   PYTHONHASHSEED=random \
diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py
new file mode 100644
index 0000000..8641f61
--- /dev/null
+++ b/analyse/analyse_crawled_domains.py
@@ -0,0 +1,53 @@
+"""
+See how many unique URLs and root domains we have crawled.
+"""
+import glob
+import gzip
+import json
+from collections import defaultdict, Counter
+from urllib.parse import urlparse
+
+CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
+
+
+def get_urls():
+    for path in glob.glob(CRAWL_GLOB):
+        data = json.load(gzip.open(path))
+        user = data['user_id_hash']
+        for item in data['items']:
+            yield user, item['url']
+
+
+def analyse_urls(urls):
+    url_set = defaultdict(list)
+    domains = set()
+    for user, url in urls:
+        url_set[url].append(user)
+
+        parsed_url = urlparse(url)
+        path = parsed_url.path.strip('/')
+        if path == '':
+            domains.add(parsed_url.netloc)
+
+    count = sum(len(x) for x in url_set.values())
+    print("Root pages crawled", sorted(domains))
+    find_worst_pages(url_set)
+    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
+    url_list_size = len(json.dumps(list(url_set.keys())))
+    print("Length of all URLs", url_list_size)
+
+
+def find_worst_pages(url_set):
+    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
+    for count, url in worst:
+        print("Worst", count, url, Counter(url_set[url]))
+
+
+def run():
+    urls = get_urls()
+    analyse_urls(urls)
+
+
+if __name__ == '__main__':
+    run()
+
diff --git a/analyse/export_top_domains.py b/analyse/export_top_domains.py
new file mode 100644
index 0000000..2794804
--- /dev/null
+++ b/analyse/export_top_domains.py
@@ -0,0 +1,13 @@
+import json
+
+from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+
+
+def export_top_domains_to_json():
+    with open(TOP_DOMAINS_JSON_PATH, 'w') as output_file:
+        json.dump(DOMAINS, output_file, indent=2)
+
+
+if __name__ == '__main__':
+    export_top_domains_to_json()
diff --git a/analyse/top_links.py b/analyse/top_links.py
new file mode 100644
index 0000000..012c44f
--- /dev/null
+++ b/analyse/top_links.py
@@ -0,0 +1,39 @@
+"""
+Analyse crawl data to find the most popular links
+"""
+import glob
+import gzip
+import json
+from collections import defaultdict
+from urllib.parse import urlparse
+
+from analyse.analyse_crawled_domains import CRAWL_GLOB
+
+
+def get_urls():
+    for path in glob.glob(CRAWL_GLOB):
+        data = json.load(gzip.open(path))
+        for item in data['items']:
+            url = item['url']
+            domain = urlparse(url).hostname
+            for link in item['links']:
+                yield domain, link
+
+
+def collect_links(urls):
+    links = defaultdict(set)
+    for url, link in urls:
+        links[link].add(url)
+    return links
+
+
+def run():
+    url_links = get_urls()
+    collected = collect_links(url_links)
+    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
+    for url, items in top:
+        print("URL", url, len(items))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/mwmbl/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py
index d81d869..f05b2d4 100644
--- a/mwmbl/indexer/fsqueue.py
+++ b/mwmbl/indexer/fsqueue.py
@@ -58,6 +58,15 @@ class GzipJsonRowSerializer(Serializer):
         return [json.loads(line) for line in lines.strip().split('\n')]
 
 
+class GzipJsonBlobSerializer(Serializer):
+    def serialize(self, items: list[object]) -> bytes:
+        raise NotImplementedError("Serializer not needed - blob is generated by browser extension")
+
+    def deserialize(self, serialized_items: bytes) -> list[object]:
+        data = gzip.decompress(serialized_items).decode('utf8')
+        return json.loads(data)
+
+
 class FSQueue:
     def __init__(self, directory: Union[str, Path], name: str, serializer: Serializer):
         self.directory = str(directory)
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index d0f0efe..8bd0dc9 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -12,8 +12,6 @@ import pandas as pd
 # PAGE_SIZE = 512
 from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
-NUM_INITIAL_TOKENS = 50
-
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100
@@ -24,10 +22,13 @@ def is_content_token(nlp, token):
     return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
-def tokenize(nlp, cleaned_text):
+def tokenize(nlp, input_text):
+    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     tokens = nlp.tokenizer(cleaned_text)
-    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
-                      if is_content_token(nlp, token)]
+    if input_text.endswith('…'):
+        # Discard the last two tokens since there will likely be a word cut in two
+        tokens = tokens[:-2]
+    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     return lowered
 
diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py
new file mode 100644
index 0000000..3c30482
--- /dev/null
+++ b/mwmbl/indexer/index_crawl.py
@@ -0,0 +1,46 @@
+"""
+Index data crawled through the Mwmbl crawler.
+"""
+from logging import getLogger
+
+import spacy
+
+from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
+from mwmbl.indexer.index import index_titles_urls_and_extracts
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
+from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
+
+
+logger = getLogger(__name__)
+
+
+def index_mwmbl_crawl_data():
+    nlp = spacy.load("en_core_web_sm")
+
+    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
+
+
+def get_mwmbl_crawl_titles_urls_and_extracts():
+    input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
+    input_queue.unlock_all()
+    while True:
+        try:
+            next_item = input_queue.get()
+        except FSQueueError as e:
+            logger.exception(f'Error with item {e.item_id}')
+            input_queue.error(e.item_id)
+            continue
+        if next_item is None:
+            logger.info('No more items to process, stopping')
+            break
+        item_id, item_data = next_item
+        logger.info(f'Processing item {item_id}')
+        for item in item_data['items']:
+            yield item['title'], item['url'], item['extract']
+        input_queue.done(item_id)
+
+
+if __name__ == '__main__':
+    index_mwmbl_crawl_data()
diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py
index f9cf1e0..c372021 100644
--- a/mwmbl/indexer/paths.py
+++ b/mwmbl/indexer/paths.py
@@ -19,4 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
 
-INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
+LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
+INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
+MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
+
+TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py
index e692284..d0cde72 100644
--- a/mwmbl/tinysearchengine/app.py
+++ b/mwmbl/tinysearchengine/app.py
@@ -1,8 +1,11 @@
 import logging
 import argparse
+
+import pandas as pd
 import uvicorn
 
 from mwmbl.tinysearchengine import create_app
+from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from mwmbl.tinysearchengine.config import parse_config_file
 
@@ -28,8 +31,12 @@ def main():
     """
     config, tiny_index = get_config_and_index()
 
+    # Load term data
+    terms = pd.read_csv(config.terms_path)
+    completer = Completer(terms)
+
     # Initialize FastApi instance
-    app = create_app.create(tiny_index)
+    app = create_app.create(tiny_index, completer)
 
     # Initialize uvicorn server using global app instance and server config params
     uvicorn.run(app, **config.server_config.dict())
diff --git a/mwmbl/tinysearchengine/completer.py b/mwmbl/tinysearchengine/completer.py
new file mode 100644
index 0000000..2336fb5
--- /dev/null
+++ b/mwmbl/tinysearchengine/completer.py
@@ -0,0 +1,37 @@
+from bisect import bisect_left, bisect_right
+from datetime import datetime
+
+import pandas as pd
+from pandas import DataFrame
+
+
+class Completer:
+    def __init__(self, terms: DataFrame, num_matches: int = 3):
+        terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
+        self.terms = list(terms_dict.keys())
+        self.counts = list(terms_dict.values())
+        self.num_matches = num_matches
+        print("Terms", self.terms[:100], self.counts[:100])
+
+    def complete(self, term) -> list[str]:
+        term_length = len(term)
+        start_index = bisect_left(self.terms, term, key=lambda x: x[:term_length])
+        end_index = bisect_right(self.terms, term, key=lambda x: x[:term_length])
+
+        matching_terms = zip(self.counts[start_index:end_index], self.terms[start_index:end_index])
+        top_terms = sorted(matching_terms, reverse=True)[:self.num_matches]
+        print("Top terms, counts", top_terms)
+        if not top_terms:
+            return []
+
+        counts, terms = zip(*top_terms)
+        return list(terms)
+
+
+if __name__ == '__main__':
+    data = pd.read_csv('data/mwmbl-crawl-terms.csv')
+    completer = Completer(data)
+    start = datetime.now()
+    completer.complete('fa')
+    end = datetime.now()
+    print("Time", end - start)
diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py
index c506117..2fd6f54 100644
--- a/mwmbl/tinysearchengine/config.py
+++ b/mwmbl/tinysearchengine/config.py
@@ -18,6 +18,7 @@ class IndexConfigModel(BaseModel):
 class ConfigModel(BaseModel):
     server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
     index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
+    terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
 
 
 def parse_config_file(config_filename: str) -> ConfigModel:
diff --git a/mwmbl/tinysearchengine/create_app.py b/mwmbl/tinysearchengine/create_app.py
index 538a539..8e54df1 100644
--- a/mwmbl/tinysearchengine/create_app.py
+++ b/mwmbl/tinysearchengine/create_app.py
@@ -6,20 +6,18 @@ from urllib.parse import urlparse
 
 from fastapi import FastAPI
 from starlette.middleware.cors import CORSMiddleware
-from starlette.responses import FileResponse
-from starlette.staticfiles import StaticFiles
 
+from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 
 logger = getLogger(__name__)
 
-STATIC_FILES_PATH = Path(__file__).parent / 'static'
 SCORE_THRESHOLD = 0.25
 
 
-def create(tiny_index: TinyIndex):
+def create(tiny_index: TinyIndex, completer: Completer):
     app = FastAPI()
 
     # Allow CORS requests from any site
@@ -33,9 +31,10 @@ def create(tiny_index: TinyIndex):
     def search(s: str):
         results, terms = get_results(s)
 
+        is_complete = s.endswith(' ')
+        pattern = get_query_regex(terms, is_complete)
         formatted_results = []
         for result in results:
-            pattern = get_query_regex(terms)
             formatted_result = {}
             for content_type, content in [('title', result.title), ('extract', result.extract)]:
                 matches = re.finditer(pattern, content, re.IGNORECASE)
@@ -53,17 +52,23 @@ def create(tiny_index: TinyIndex):
         logger.info("Return results: %r", formatted_results)
         return formatted_results
 
-    def get_query_regex(terms):
-        term_patterns = [rf'\b{term}\b' for term in terms]
+    def get_query_regex(terms, is_complete):
+        if not terms:
+            return ''
+
+        if is_complete:
+            term_patterns = [rf'\b{term}\b' for term in terms]
+        else:
+            term_patterns = [rf'\b{term}\b' for term in terms[:-1]] + [rf'\b{terms[-1]}']
         pattern = '|'.join(term_patterns)
         return pattern
 
-    def score_result(terms, result: Document):
+    def score_result(terms, result: Document, is_complete: bool):
         domain = urlparse(result.url).netloc
         domain_score = DOMAINS.get(domain, 0.0)
         result_string = f"{result.title.strip()} {result.extract.strip()}"
-        query_regex = get_query_regex(terms)
+        query_regex = get_query_regex(terms, is_complete)
         matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
         match_strings = {x.group(0).lower() for x in matches}
         match_length = sum(len(x) for x in match_strings)
 
@@ -80,8 +85,8 @@ def create(tiny_index: TinyIndex):
         score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
         return score
 
-    def order_results(terms: list[str], results: list[Document]):
-        results_and_scores = [(score_result(terms, result), result) for result in results]
+    def order_results(terms: list[str], results: list[Document], is_complete: bool):
+        results_and_scores = [(score_result(terms, result, is_complete), result) for result in results]
         ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
         filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
         return filtered_results
@@ -97,9 +102,15 @@ def create(tiny_index: TinyIndex):
 
     def get_results(q):
         terms = [x.lower() for x in q.replace('.', ' ').split()]
+        is_complete = q.endswith(' ')
+        if len(terms) > 0 and not is_complete:
+            retrieval_terms = terms[:-1] + completer.complete(terms[-1])
+        else:
+            retrieval_terms = terms
+
         pages = []
         seen_items = set()
-        for term in terms:
+        for term in retrieval_terms:
             items = tiny_index.retrieve(term)
             if items is not None:
                 for item in items:
@@ -108,12 +119,6 @@ def create(tiny_index: TinyIndex):
                     pages.append(item)
                     seen_items.add(item.title)
 
-        ordered_results = order_results(terms, pages)
+        ordered_results = order_results(terms, pages, is_complete)
         return ordered_results, terms
-
-    @app.get('/')
-    def index():
-        return FileResponse(STATIC_FILES_PATH / 'index.html')
-
-    app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
 
     return app
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index d5fe684..33ebfc0 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -41,7 +41,6 @@ class TinyIndexBase(Generic[T]):
         page = self.get_page(index)
         if page is None:
             return []
-        # print("REtrieve", self.index_path, page)
         return self.convert_items(page)
 
     def _get_key_page_index(self, key):
@@ -53,25 +52,21 @@
         Get the page at index i, decompress and deserialise it using JSON
         """
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
         try:
             decompressed_data = self.decompressor.decompress(page_data)
         except ZstdError:
             return None
         results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
         return results
 
     def convert_items(self, items) -> List[T]:
         converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
         return converted
 
 
 class TinyIndex(TinyIndexBase[T]):
     def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
         super().__init__(item_factory, num_pages, page_size)
-        # print("REtrieve path", index_path)
         self.index_path = index_path
         self.index_file = open(self.index_path, 'rb')
         self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
diff --git a/mwmbl/tinysearchengine/static/index.css b/mwmbl/tinysearchengine/static/index.css
deleted file mode 100644
index d6e2348..0000000
--- a/mwmbl/tinysearchengine/static/index.css
+++ /dev/null
@@ -1,82 +0,0 @@
-html {
-    font-family: Verdana, Geneva, sans-serif;
-    background: #dcdced;
-}
-
-body {
-    font-size: 1.2rem;
-}
-
-p {
-    width: 100%;
-    white-space: nowrap;
-    overflow: hidden;
-    text-overflow: ellipsis;
-    margin: 3px;
-}
-
-div {
-    margin-top: 15px;
-    margin-bottom: 15px;
-}
-
-.url {
-    margin-top: 0px;
-    font-size: 1rem;
-}
-
-.container {
-    width: 100%;
-    max-width: 1024px;
-    margin: 0 auto;
-}
-
-
-#search {
-    display: block;
-    width: 100%;
-
-    outline: none;
-
-    font-size: inherit;
-
-    border: 2px solid #ccc;
-    border-width: 4px;
-    border-radius: 50px;
-
-    padding: 10px;
-    padding-left: 35px;
-
-    margin-top: 50px;
-}
-
-a {
-    text-decoration: none;
-    color: #555555;
-}
-
-div .result:hover {
-    background: #f0f0ff;
-}
-
-div .selected {
-    background: #f0f0ff;
-}
-
-div .result {
-    padding: 10px;
-}
-
-
-
-span .term {
-    font-weight: bold;
-}
-
-.title {
-    color: #1a0daa;
-    /* color: black; */
-}
-
-.extract {
-}
diff --git a/mwmbl/tinysearchengine/static/index.html b/mwmbl/tinysearchengine/static/index.html
deleted file mode 100644
index 8ab2c2e..0000000
--- a/mwmbl/tinysearchengine/static/index.html
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-
-