From 2844c1df75dd6d83020770dbbfa411fa1e6c491c Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Mon, 13 Dec 2021 11:23:01 +0000 Subject: [PATCH] Index common crawl data --- extract_local.py | 6 +----- indexcc.py | 46 ++++++++++++++++++++-------------------------- paths.py | 7 ++++++- poetry.lock | 38 +++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 5 files changed, 65 insertions(+), 33 deletions(-) diff --git a/extract_local.py b/extract_local.py index 0aef839..040883f 100644 --- a/extract_local.py +++ b/extract_local.py @@ -3,14 +3,10 @@ import json import os from glob import glob from multiprocessing import Process, Lock -from pathlib import Path -from time import sleep from extract_process import fetch_process_warc_records from fsqueue import FSQueue, GzipJsonRowSerializer - -DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' -EXTRACTS_PATH = DATA_DIR / 'extracts' +from paths import DATA_DIR ARCHIVE_INFO_GLOB = 'outputs/records/*.gz' diff --git a/indexcc.py b/indexcc.py index 1f4c4e4..574be5a 100644 --- a/indexcc.py +++ b/indexcc.py @@ -1,38 +1,32 @@ """ -Index Wikipedia +Index data downloaded from Common Crawl """ -import gzip -import html -from urllib.parse import quote -from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES -from paths import WIKI_TITLES_PATH, INDEX_PATH +import spacy -TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text'] -TITLE_START = 'Wikipedia: ' -TITLE_END = '\n' +from fsqueue import FSQueue, GzipJsonRowSerializer +from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES, Document +from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH -def index_wiki(): +def index_common_craw_data(): nlp = spacy.load("en_core_web_sm") - with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: - titles_and_urls = get_wiki_titles_and_urls() - index_titles_and_urls(indexer, nlp, titles_and_urls) + + with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: + titles_and_urls = get_common_crawl_titles_and_urls() + index_titles_and_urls(indexer, nlp, titles_and_urls, COMMON_CRAWL_TERMS_PATH) -def get_wiki_titles_and_urls(): - start_len = len(TITLE_START) - end_len = len(TITLE_END) - with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file: - wiki_titles_file.readline() - for raw_title in wiki_titles_file: - assert raw_title.startswith(TITLE_START) - assert raw_title.endswith(TITLE_END) - title = raw_title[start_len:-end_len] - unescaped_title = html.unescape(title) - url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_')) - yield unescaped_title, url +def get_common_crawl_titles_and_urls(): + input_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer()) + while True: + next_item = input_queue.get() + if next_item is None: + break + item_id, items = next_item + for url, title, extract in items: + yield title, url if __name__ == '__main__': - index_wiki() + index_common_craw_data() diff --git a/paths.py b/paths.py index 88335df..39d30a8 100644 --- a/paths.py +++ b/paths.py @@ -1,7 +1,11 @@ import os +from pathlib import Path HOME = os.getenv('HOME') -DATA_DIR = os.path.join(HOME, 'data', 'tinysearch') + +DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' +COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv' + HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv') CRAWL_PREFIX = 'crawl_' CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*") @@ -11,6 +15,7 @@ TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv') WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2') WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz') + DOMAINS_QUEUE_NAME = 'domains-queue-fs' DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs' DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz') diff --git a/poetry.lock b/poetry.lock index df81da1..b7b2398 100644 --- a/poetry.lock +++ b/poetry.lock @@ -221,6 +221,14 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "mmh3" +version = "3.0.0" +description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions." +category = "main" +optional = false +python-versions = "*" + [[package]] name = "murmurhash" version = "1.0.6" @@ -659,7 +667,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "d551f110c809c3c84dcd7061a00f8a2b6fb75bab5a7550fbf4bfe60d4300b37b" +content-hash = "8e573b5968296b81e95cfe0308ad10a5a5e2f80e2a9020a2478d61ae751c4d0c" [metadata.files] beautifulsoup4 = [ @@ -940,6 +948,34 @@ markupsafe = [ {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, ] +mmh3 = [ + {file = "mmh3-3.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:23912dde2ad4f701926948dd8e79a0e42b000f73962806f153931f52985e1e07"}, + {file = "mmh3-3.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:07f1308a410dc406d6a3c282a685728d00a87f3ed684f012671b96d6cc6a41c3"}, + {file = "mmh3-3.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:167cbc2b5ae27f3bccd797a2e8a9e7561791bee4cc2885f2c140eedc5df000ef"}, + {file = "mmh3-3.0.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:8fb833c2942917eff54f984b067d93e5a3c54dbb00720323460cdfed9292835f"}, + {file = "mmh3-3.0.0-cp36-cp36m-win32.whl", hash = "sha256:b7d26d0243ed9a5b8bf7aa8c53697cb79dff1e1d207f42396b7a7cb2a62298b7"}, + {file = "mmh3-3.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2b6c79fc314b34b911245b460a79b601fff39bb807521fb7ed7c15cacf0394ac"}, + {file = "mmh3-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d0b3e9def1fdfe4eadd35ee26bf72bd715ba97711f7101302d54c9d2e70ba27"}, + {file = "mmh3-3.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:8803d28c17cf898f5f00c0433e8b13d51fa3bb4ebecf59872ba1eaa20d94128a"}, + {file = "mmh3-3.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:01e456edf9cc381298a590923aadd1c0bf9934d93433099a5001d656112437c2"}, + {file = "mmh3-3.0.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:ff69ddc2d46e3e42720840b6b4f7bfb032fd1e677fac347fdfff6e4d9fd01212"}, + {file = "mmh3-3.0.0-cp37-cp37m-win32.whl", hash = "sha256:e08a5d81a2ff53625953290187bed4ae96a6972e2b5cd5984a6ebc5a9aab256c"}, + {file = "mmh3-3.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:12484ac80373db77d8a6beb7615e7dac8b6c3fb118905311a51450b4fc4a24d1"}, + {file = "mmh3-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:93c96e657e9bf9e9ef12ddaeae9f109c0b3134146e2eff2cbddde5a34190920e"}, + {file = "mmh3-3.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:9097be65aa95460bc68b6108601da8894757532450daf74034e4eaecd536acca"}, + {file = "mmh3-3.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:19874e12acb4119ef1ef83062ef4ac953c3343dd07a67ede8fa096d0393f34be"}, + {file = "mmh3-3.0.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:4589adcb609d1547aac7c1ac1064eb27cdd44b65b7e8a114e2971cd3b7110306"}, + {file = "mmh3-3.0.0-cp38-cp38-win32.whl", hash = "sha256:7a311efd4ecf122f21392ec6bf447c620cc783d20bdb9aec60bb469a54318419"}, + {file = "mmh3-3.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:3566d1455fa4a09f8fb1aa5b37f68914949674f9aa2bd630e9fdf344207f55b5"}, + {file = "mmh3-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:92fdffd63edb67c30dbaba18a7448d762209c0e678b0c9d577d17b30362b59a3"}, + {file = "mmh3-3.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e52b869572c09db0c1a483f6e9cedbccfae8a282d95e552d3d4bd0712ab3196"}, + {file = "mmh3-3.0.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f1cce018cc82a8a6287e6aeb139e441129837b810f2ddf372e3ff7f0fefb0947"}, + {file = "mmh3-3.0.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:0fd09c4b61fcddbcf0a87d5463b4e6d2919896736a67efc5248d5c74c1c9c742"}, + {file = "mmh3-3.0.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c17fe2e276edd37ad8a6aff3b1663d3479c2c5c5993539c1050422a1dae33033"}, + {file = "mmh3-3.0.0-cp39-cp39-win32.whl", hash = "sha256:150439b906b4deaf6d796b2c2d11fb6159f08d02330d97723071ab3bf43b51df"}, + {file = "mmh3-3.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:bd870aedd9189eff1cf4e1687869b56c7e9461ee869789139c3e704009e5c227"}, + {file = "mmh3-3.0.0.tar.gz", hash = "sha256:d1ec578c09a07d3518ec9be540b87546397fa3455de73c166fcce51eaa5c41c5"}, +] murmurhash = [ {file = "murmurhash-1.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1431d817e1fff1ed35f8dc54dd5b4d70165ec98076de8aca351805f8037293f3"}, {file = "murmurhash-1.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c7b8cc4a8db1c821b80f8ca70a25c3166b14d68ecef8693a117c6a0b1d74ace"}, diff --git a/pyproject.toml b/pyproject.toml index 6c6320f..2aacbbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ pyspark = "^3.2.0" langdetect = "^1.0.9" zstandard = "^0.16.0" spacy = "^3.2.1" +mmh3 = "^3.0.0" [tool.poetry.dependencies.en_core_web_sm] url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"