Index extracts
parent 4fa1c4a39a
commit 16121d2b19

5 changed files with 20 additions and 18 deletions
index.py (16 lines changed)
@@ -53,6 +53,7 @@ def clean(content):
 class Document:
     title: str
     url: str
+    extract: str


 @dataclass
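For reference, a minimal sketch of the record after this hunk, assuming Document is itself declared as a @dataclass (the keyword-argument construction later in this diff suggests it is); the field values are illustrative only:

from dataclasses import dataclass

@dataclass
class Document:
    title: str
    url: str
    extract: str

# Illustrative values only
doc = Document(
    title="Example page",
    url="https://example.com/page",
    extract="A short snippet of the page text shown alongside the result.",
)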
@@ -181,13 +182,14 @@ def prepare_url_for_tokenizing(url: str):
     return url


-def get_pages(nlp, titles_and_urls) -> Iterable[TokenizedDocument]:
-    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
+    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
-        tokens = title_tokens | url_tokens
-        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
+        extract_tokens = tokenize(nlp, extract)
+        tokens = title_tokens | url_tokens | extract_tokens
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)

         if i % 1000 == 0:
             print("Processed", i)
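A minimal usage sketch of the updated get_pages, which is defined in index.py (the file this hunk modifies); it assumes the spaCy English pipeline used elsewhere in this commit, and the input triple is illustrative:

from spacy.lang.en import English

from index import get_pages

nlp = English()
titles_urls_and_extracts = [
    ("Example page", "https://example.com/page",
     "A short snippet of the page text."),
]
for doc in get_pages(nlp, titles_urls_and_extracts):
    # tokens now cover the title, the URL and the extract
    print(doc.title, doc.url, doc.extract, doc.tokens[:5])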
@@ -201,14 +203,14 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path):
+def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
     indexer.create_if_not_exists()

     terms = Counter()
-    pages = get_pages(nlp, titles_and_urls)
+    pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:
         for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title))
+            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
         terms.update([t.lower() for t in page.tokens])

     term_df = pd.DataFrame({
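A hedged end-to-end sketch of the renamed entry point. The TinyIndexer constructor here follows the indexcc.py call site in this commit (other call sites omit the Document argument), and the index and terms paths are placeholders, not paths from the repository:

from spacy.lang.en import English

from index import (TinyIndexer, Document, NUM_PAGES, PAGE_SIZE,
                   index_titles_urls_and_extracts)

nlp = English()
titles_urls_and_extracts = [
    ("Example page", "https://example.com/page",
     "A short snippet of the page text."),
]
# 'example-index.tinysearch' and 'example-terms.csv' are placeholder paths
with TinyIndexer(Document, 'example-index.tinysearch', NUM_PAGES, PAGE_SIZE) as indexer:
    index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts,
                                   'example-terms.csv')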
@@ -4,7 +4,7 @@ Index items in the file-system queue
 from spacy.lang.en import English

 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_and_urls
+from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH

@@ -23,7 +23,7 @@ def index_queue_items():
     nlp = English()
     with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
         titles_and_urls = get_queue_items()
-        index_titles_and_urls(indexer, nlp, titles_and_urls)
+        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)


 if __name__ == '__main__':
indexcc.py (10 lines changed)
@@ -8,7 +8,7 @@ from logging import getLogger
 import spacy

 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES, Document
+from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH

@@ -20,11 +20,11 @@ def index_common_craw_data():
     nlp = spacy.load("en_core_web_sm")

     with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
-        titles_and_urls = get_common_crawl_titles_and_urls()
-        index_titles_and_urls(indexer, nlp, titles_and_urls, COMMON_CRAWL_TERMS_PATH)
+        titles_urls_and_extracts = get_common_crawl_titles_urls_and_extracts()
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, COMMON_CRAWL_TERMS_PATH)


-def get_common_crawl_titles_and_urls():
+def get_common_crawl_titles_urls_and_extracts():
     input_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer())
     input_queue.unlock_all()
     while True:
@@ -40,7 +40,7 @@ def get_common_crawl_titles_and_urls():
         item_id, items = next_item
         logger.info(f'Processing item {item_id}')
         for url, title, extract in items:
-            yield title, url
+            yield title, url, extract
         input_queue.done(item_id)

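For context, each dequeued batch is unpacked above as (url, title, extract) rows and the generator re-orders them to (title, url, extract). A small consumption sketch; the islice cap is only there to avoid draining the queue:

from itertools import islice

# Peek at the first few items produced by the renamed generator
for title, url, extract in islice(get_common_crawl_titles_urls_and_extracts(), 3):
    print(title, url, extract[:80])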
@@ -10,7 +10,7 @@ from starlette.testclient import TestClient

 import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
+from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

 NUM_DOCUMENTS = 30000
@@ -87,7 +87,7 @@ def performance_test():
     titles_and_urls = get_test_pages()

     start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
+    index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
     stop_time = datetime.now()

     index_time = (stop_time - start_time).total_seconds()
wiki.py (4 lines changed)
@@ -7,7 +7,7 @@ from urllib.parse import quote

 from spacy.lang.en import English

-from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
+from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH

 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -19,7 +19,7 @@ def index_wiki():
     nlp = English()
     with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
         titles_and_urls = get_wiki_titles_and_urls()
-        index_titles_and_urls(indexer, nlp, titles_and_urls)
+        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)


 def get_wiki_titles_and_urls():