Index queued items

Daoud Clarke 2021-05-19 21:48:03 +01:00
parent 87fd458218
commit 974f18647a
4 changed files with 77 additions and 37 deletions

fsqueue.py

@@ -95,3 +95,11 @@ class FSQueue:
        """
        self._move(item_id, FSState.LOCKED, FSState.DONE)

    def unlock_all(self):
        paths = sorted(Path(self._get_dir(FSState.LOCKED)).iterdir(), key=os.path.getmtime)
        for path in paths:
            # Move the item back to the ready state, i.e. unlock it
            self._move(path.name, FSState.LOCKED, FSState.READY)
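The unlock_all pass assumes a directory-per-state queue layout in which _move renames an item's file from one state directory to another. Neither _move nor the layout appears in this commit, so the following is a minimal sketch of the assumed pattern, with hypothetical directory names:

import os
from pathlib import Path

def _move(base: Path, item_id: str, src_state: str, dst_state: str):
    # Rename the item's file between state directories; rename is atomic
    # on a single filesystem, which is what makes the state change safe.
    os.rename(base / src_state / item_id, base / dst_state / item_id)

def unlock_all(base: Path):
    # Return items stuck in "locked" (e.g. left behind by a crashed
    # worker) to "ready", oldest first by modification time, mirroring
    # the sorted-by-mtime loop in the diff above.
    for path in sorted((base / "locked").iterdir(), key=os.path.getmtime):
        _move(base, path.name, "locked", "ready")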

index.py

@@ -1,25 +1,18 @@
"""
Create a search index
"""
import gzip
import json
import os
import sqlite3
from dataclasses import dataclass
from glob import glob
from itertools import chain, count, islice
from itertools import islice
from mmap import mmap, PROT_READ
from typing import List, Iterator
from urllib.parse import unquote

import bs4
import justext
import mmh3
from spacy.lang.en import English
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

from paths import CRAWL_GLOB, INDEX_PATH

NUM_PAGES = 8192
PAGE_SIZE = 512
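These constants size the index at NUM_PAGES × PAGE_SIZE = 8192 × 512 bytes = 4 MiB, assuming fixed-size pages stored contiguously in a single file; the mmap import is consistent with that layout, though the layout itself is outside this hunk.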
@@ -160,32 +153,6 @@ class TinyIndexer(TinyIndexBase):
        raise NotImplementedError()


def run():
    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    indexer.create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        print("Path", path)
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()

        if indexer.document_indexed(url):
            print("Page exists, skipping", url)
            continue

        cleaned_text = clean(content)
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        indexer.index(tokens, url, title)


def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]

@@ -224,6 +191,3 @@ def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
    for chunk in grouper(BATCH_SIZE, pages):
        indexer.index(list(chunk))


if __name__ == '__main__':
    run()
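index_titles_and_urls batches its pages with a grouper helper whose definition is outside this diff. A plausible islice-based version (an assumption, not necessarily the repo's code) would be:

from itertools import islice

def grouper(n, iterator):
    # Yield successive lists of up to n items until the iterator is
    # exhausted; the caller above passes each chunk to indexer.index().
    while True:
        chunk = list(islice(iterator, n))
        if not chunk:
            return
        yield chunk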

index_glob.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import gzip
from glob import glob

import bs4
from spacy.lang.en import English

from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
from paths import INDEX_PATH, CRAWL_GLOB


def run():
    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    indexer.create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        print("Path", path)
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()

        if indexer.document_indexed(url):
            print("Page exists, skipping", url)
            continue

        cleaned_text = clean(content)
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        indexer.index(tokens, url, title)


if __name__ == '__main__':
    run()
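index_glob.py reuses clean and tokenize from index.py, neither of which appears in this diff. Since tokenize takes the spacy English pipeline as its first argument, a plausible shape is the following sketch (assumed, not the repo's implementation):

def tokenize(nlp, cleaned_text):
    # Tokenize with spacy's rule-based tokenizer and lowercase the result;
    # the real function may additionally filter punctuation or stop words.
    return [token.text.lower() for token in nlp.tokenizer(cleaned_text)]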

index_queue.py (new file, 30 lines)

@@ -0,0 +1,30 @@
"""
Index items in the file-system queue
"""
from spacy.lang.en import English

from fsqueue import FSQueue, ZstdJsonSerializer
from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_and_urls
from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH


def get_queue_items():
    titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
    titles_queue.unlock_all()
    while True:
        items_id, items = titles_queue.get()
        for item in items:
            if item['title'] is None:
                continue
            yield item['title'], item['url']


def index_queue_items():
    nlp = English()
    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        titles_and_urls = get_queue_items()
        index_titles_and_urls(indexer, nlp, titles_and_urls)


if __name__ == '__main__':
    index_queue_items()
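index_queue.py also depends on ZstdJsonSerializer from fsqueue, which this commit does not show. Judging by the name and the zstandard usage elsewhere in the repo, it presumably pairs JSON encoding with zstd compression; a minimal sketch under that assumption (the method names are likewise assumed from the serializer interface, not confirmed by the diff):

import json
from zstandard import ZstdCompressor, ZstdDecompressor

class ZstdJsonSerializer:
    def __init__(self):
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()

    def serialize(self, items) -> bytes:
        # JSON-encode, then compress; compress() emits a complete zstd frame.
        return self.compressor.compress(json.dumps(items).encode('utf8'))

    def deserialize(self, data: bytes):
        return json.loads(self.decompressor.decompress(data).decode('utf8'))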