mwmbl/index.py

"""
Create a search index
"""
from collections import Counter
from itertools import islice
from typing import Iterator, Iterable
from urllib.parse import unquote

import pandas as pd

# NUM_PAGES = 8192
# PAGE_SIZE = 512
from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument

NUM_INITIAL_TOKENS = 50

HTTP_START = 'http://'
HTTPS_START = 'https://'

BATCH_SIZE = 100


def is_content_token(nlp, token):
    """Return True if the token is content-bearing: alphabetic or numeric and not a stop word."""
    lexeme = nlp.vocab[token.orth]
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


def tokenize(nlp, cleaned_text):
    """Tokenize the first NUM_INITIAL_TOKENS tokens of the text and return the lowercased content tokens as a set."""
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered


def prepare_url_for_tokenizing(url: str):
    """Strip the scheme and replace '/', '.' and '_' with spaces so the URL can be tokenized as plain text."""
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    elif url.startswith(HTTPS_START):
        url = url[len(HTTPS_START):]
    for c in '/._':
        if c in url:
            url = url.replace(c, ' ')
    return url


def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
    """Tokenize each page's title, URL and extract and yield a TokenizedDocument combining their tokens."""
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))
        url_tokens = tokenize(nlp, prepared_url)
        extract_tokens = tokenize(nlp, extract)
        print("Extract tokens", extract_tokens)
        tokens = title_tokens | url_tokens | extract_tokens
        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)

        if i % 1000 == 0:
            print("Processed", i)


def grouper(n: int, iterator: Iterator):
    """Yield successive tuples of up to n items from the iterator until it is exhausted."""
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk
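
# A small usage sketch (not in the original module): grouper() together with
# BATCH_SIZE could be used to feed pages to the indexer in fixed-size batches.
# The loop below is only illustrative; this module currently indexes pages one
# at a time in index_titles_urls_and_extracts().
#
#     for batch in grouper(BATCH_SIZE, iter(titles_urls_and_extracts)):
#         for title, url, extract in batch:
#             ...  # tokenize and index each page in the batch
#
# Note that grouper() expects an iterator, so a plain list must be wrapped
# with iter() before being passed in.
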
def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
    """Index each page under its tokens and write the overall term counts to a CSV file at terms_path."""
    indexer.create_if_not_exists()
    terms = Counter()
    pages = get_pages(nlp, titles_urls_and_extracts)
    for page in pages:
        for token in page.tokens:
            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
        terms.update([t.lower() for t in page.tokens])

    term_df = pd.DataFrame({
        'term': terms.keys(),
        'count': terms.values(),
    })
    term_df.to_csv(terms_path)
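
# Minimal end-to-end sketch (not part of the original module), assuming a spaCy
# model such as en_core_web_sm is installed. The TinyIndexer constructor
# arguments are defined in tinysearchengine.indexer and are not shown here, so
# the indexer line is only a placeholder.
#
#     import spacy
#
#     nlp = spacy.load("en_core_web_sm")
#     titles_urls_and_extracts = [
#         ("Python (programming language)",
#          "https://en.wikipedia.org/wiki/Python_(programming_language)",
#          "Python is a high-level, general-purpose programming language."),
#     ]
#     indexer = TinyIndexer(...)  # construct per tinysearchengine.indexer
#     index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, "terms.csv")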