
Index extracts

Daoud Clarke, 3 years ago
Commit 16121d2b19
5 files changed, 20 insertions and 18 deletions
  1. index.py (+9 -7)
  2. index_queue.py (+2 -2)
  3. indexcc.py (+5 -5)
  4. performance.py (+2 -2)
  5. wiki.py (+2 -2)

index.py (+9 -7)

@@ -53,6 +53,7 @@ def clean(content):
 class Document:
     title: str
     url: str
+    extract: str
 
 
 @dataclass
@@ -181,13 +182,14 @@ def prepare_url_for_tokenizing(url: str):
     return url
 
 
-def get_pages(nlp, titles_and_urls) -> Iterable[TokenizedDocument]:
-    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
+    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
-        tokens = title_tokens | url_tokens
-        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
+        extract_tokens = tokenize(nlp, extract)
+        tokens = title_tokens | url_tokens | extract_tokens
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -201,14 +203,14 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path):
+def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
     indexer.create_if_not_exists()
 
     terms = Counter()
-    pages = get_pages(nlp, titles_and_urls)
+    pages = get_pages(nlp, titles_urls_and_extracts)
     for page in pages:
         for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title))
+            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
         terms.update([t.lower() for t in page.tokens])
 
     term_df = pd.DataFrame({
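For context, a standalone sketch of the token-union step that get_pages now performs, with the new extract tokens included. The real tokenize() and prepare_url_for_tokenizing() live in index.py and are not shown in this diff, so simple stand-ins are used here, and the example page is made up.

from urllib.parse import unquote

from spacy.lang.en import English


def tokenize_sketch(nlp, text):
    # Stand-in for index.py's tokenize(): lowercased token texts as a set.
    return {token.text.lower() for token in nlp.tokenizer(text)}


def tokens_for_page(nlp, title, url, extract):
    # Mirrors the updated get_pages(): title, URL and extract tokens are unioned.
    title_tokens = tokenize_sketch(nlp, title)
    url_tokens = tokenize_sketch(nlp, unquote(url).replace('/', ' '))  # crude stand-in for prepare_url_for_tokenizing
    extract_tokens = tokenize_sketch(nlp, extract)  # new in this commit
    return title_tokens | url_tokens | extract_tokens


nlp = English()
print(tokens_for_page(nlp, "Example Domain", "https://example.com/some%20page",
                      "This domain is for use in illustrative examples."))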

index_queue.py (+2 -2)

@@ -4,7 +4,7 @@ Index items in the file-system queue
 from spacy.lang.en import English
 
 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_and_urls
+from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 
@@ -23,7 +23,7 @@ def index_queue_items():
     nlp = English()
     with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
         titles_and_urls = get_queue_items()
-        index_titles_and_urls(indexer, nlp, titles_and_urls)
+        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
 
 
 if __name__ == '__main__':

indexcc.py (+5 -5)

@@ -8,7 +8,7 @@ from logging import getLogger
 import spacy
 
 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES, Document
+from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
 
 
@@ -20,11 +20,11 @@ def index_common_craw_data():
     nlp = spacy.load("en_core_web_sm")
 
     with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
-        titles_and_urls = get_common_crawl_titles_and_urls()
-        index_titles_and_urls(indexer, nlp, titles_and_urls, COMMON_CRAWL_TERMS_PATH)
+        titles_urls_and_extracts = get_common_crawl_titles_urls_and_extracts()
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, COMMON_CRAWL_TERMS_PATH)
 
 
-def get_common_crawl_titles_and_urls():
+def get_common_crawl_titles_urls_and_extracts():
     input_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer())
     input_queue.unlock_all()
     while True:
@@ -40,7 +40,7 @@ def get_common_crawl_titles_and_urls():
         item_id, items = next_item
         logger.info(f'Processing item {item_id}')
         for url, title, extract in items:
-            yield title, url
+            yield title, url, extract
         input_queue.done(item_id)
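A minimal sketch of the generator contract introduced here: queue items arrive as (url, title, extract) rows, while index_titles_urls_and_extracts consumes (title, url, extract) tuples, so each row is reordered on yield. The in-memory batch below stands in for the FSQueue items and is made up.

from typing import Iterable, Iterator, Tuple


def titles_urls_and_extracts(batches: Iterable[list]) -> Iterator[Tuple[str, str, str]]:
    for items in batches:
        for url, title, extract in items:
            # Reorder each (url, title, extract) row into the (title, url, extract)
            # shape that index_titles_urls_and_extracts expects.
            yield title, url, extract


example_batch = [("https://example.com", "Example Domain",
                  "This domain is for use in illustrative examples.")]
for item in titles_urls_and_extracts([example_batch]):
    print(item)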
 
 

performance.py (+2 -2)

@@ -10,7 +10,7 @@ from starlette.testclient import TestClient
 
 import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
+from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
@@ -87,7 +87,7 @@ def performance_test():
         titles_and_urls = get_test_pages()
 
         start_time = datetime.now()
-        index_titles_and_urls(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
+        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
         stop_time = datetime.now()
 
         index_time = (stop_time - start_time).total_seconds()
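A sketch of the timing arithmetic in this test, assuming NUM_DOCUMENTS pages are indexed between the two timestamps. The sleep stands in for the renamed indexing call, and the per-document figure is just one way to summarise the result; it is not shown in the hunk above.

import time
from datetime import datetime

NUM_DOCUMENTS = 30000  # as defined in performance.py

start_time = datetime.now()
time.sleep(0.1)  # stand-in for index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
stop_time = datetime.now()

index_time = (stop_time - start_time).total_seconds()
time_per_doc = index_time / NUM_DOCUMENTS  # assumption: summarising per document
print(f"Indexed {NUM_DOCUMENTS} documents in {index_time:.1f}s "
      f"({time_per_doc * 1000:.3f}ms per document)")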

wiki.py (+2 -2)

@@ -7,7 +7,7 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
+from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -19,7 +19,7 @@ def index_wiki():
     nlp = English()
     with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
         titles_and_urls = get_wiki_titles_and_urls()
-        index_titles_and_urls(indexer, nlp, titles_and_urls)
+        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
 
 
 def get_wiki_titles_and_urls():
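As the hunks above show, index_queue.py and wiki.py appear to still pass (title, url) pairs to the renamed function. One hypothetical way to bridge such a two-field source to the new three-field interface is an adapter like the sketch below; this helper is illustrative only and is not part of the commit.

from typing import Iterable, Iterator, Tuple


def with_empty_extracts(titles_and_urls: Iterable[Tuple[str, str]]) -> Iterator[Tuple[str, str, str]]:
    # Hypothetical adapter: pad (title, url) pairs with an empty extract so they
    # match the (title, url, extract) tuples consumed by index_titles_urls_and_extracts.
    for title, url in titles_and_urls:
        yield title, url, ""


for item in with_empty_extracts([("Example Domain", "https://example.com")]):
    print(item)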