Index common crawl data
This commit is contained in:
parent
65b366d30d
commit
2844c1df75
5 changed files with 65 additions and 33 deletions
|
@ -3,14 +3,10 @@ import json
|
|||
import os
|
||||
from glob import glob
|
||||
from multiprocessing import Process, Lock
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
from extract_process import fetch_process_warc_records
|
||||
from fsqueue import FSQueue, GzipJsonRowSerializer
|
||||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
EXTRACTS_PATH = DATA_DIR / 'extracts'
|
||||
from paths import DATA_DIR
|
||||
|
||||
ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'
|
||||
|
||||
|
|
46
indexcc.py
46
indexcc.py
|
@ -1,38 +1,32 @@
|
|||
"""
|
||||
Index Wikipedia
|
||||
Index data downloaded from Common Crawl
|
||||
"""
|
||||
import gzip
|
||||
import html
|
||||
from urllib.parse import quote
|
||||
|
||||
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
|
||||
from paths import WIKI_TITLES_PATH, INDEX_PATH
|
||||
import spacy
|
||||
|
||||
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
|
||||
TITLE_START = '<title>Wikipedia: '
|
||||
TITLE_END = '</title>\n'
|
||||
from fsqueue import FSQueue, GzipJsonRowSerializer
|
||||
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES, Document
|
||||
from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
|
||||
|
||||
|
||||
def index_wiki():
|
||||
def index_common_craw_data():
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
|
||||
titles_and_urls = get_wiki_titles_and_urls()
|
||||
index_titles_and_urls(indexer, nlp, titles_and_urls)
|
||||
|
||||
with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
|
||||
titles_and_urls = get_common_crawl_titles_and_urls()
|
||||
index_titles_and_urls(indexer, nlp, titles_and_urls, COMMON_CRAWL_TERMS_PATH)
|
||||
|
||||
|
||||
def get_wiki_titles_and_urls():
    """
    Yield ``(title, url)`` pairs read from the gzipped file at WIKI_TITLES_PATH.

    The first line of the file is read and discarded (presumably a header --
    confirm against the dump format). Every following line must start with
    TITLE_START and end with TITLE_END; the text in between is HTML-unescaped
    to give the title, and the URL is the English Wikipedia article URL with
    spaces replaced by underscores and the result percent-quoted.

    Raises:
        ValueError: if a line is not wrapped in TITLE_START ... TITLE_END.
            An explicit exception is raised rather than using ``assert`` so
            the check still runs under ``python -O``.
    """
    # Compare without the trailing newline so that a final line that is not
    # newline-terminated is still accepted.
    title_end = TITLE_END.rstrip('\n')
    start_len = len(TITLE_START)
    end_len = len(title_end)

    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # text encoding.
    with gzip.open(WIKI_TITLES_PATH, 'rt', encoding='utf-8') as wiki_titles_file:
        wiki_titles_file.readline()
        for line_number, raw_title in enumerate(wiki_titles_file, start=2):
            line = raw_title.rstrip('\n')
            if not (line.startswith(TITLE_START) and line.endswith(title_end)):
                raise ValueError(
                    f"Unexpected title line {line_number} in {WIKI_TITLES_PATH}: {raw_title!r}"
                )
            title = line[start_len:len(line) - end_len]
            unescaped_title = html.unescape(title)
            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
            yield unescaped_title, url
|
||||
def get_common_crawl_titles_and_urls():
    """
    Yield ``(title, url)`` pairs from the 'search-items' queue under DATA_DIR.

    Items are pulled from the FSQueue one at a time until ``get()`` returns
    None. Each item is a pair whose second element is an iterable of
    ``(url, title, extract)`` rows; only the title and URL of each row are
    yielded, in that order.
    """
    queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer())
    while (entry := queue.get()) is not None:
        # The first element (the item id) and each row's extract are not
        # needed for indexing titles and URLs.
        _item_id, rows = entry
        for url, title, _extract in rows:
            yield title, url
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
index_wiki()
|
||||
index_common_craw_data()
|
||||
|
|
7
paths.py
7
paths.py
|
@ -1,7 +1,11 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
HOME = os.getenv('HOME')
|
||||
DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
|
||||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
|
||||
|
||||
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
|
||||
CRAWL_PREFIX = 'crawl_'
|
||||
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
|
||||
|
@ -11,6 +15,7 @@ TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
|
|||
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
|
||||
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
|
||||
|
||||
|
||||
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
|
||||
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
|
||||
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
|
||||
|
|
38
poetry.lock
generated
38
poetry.lock
generated
|
@ -221,6 +221,14 @@ category = "main"
|
|||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
|
||||
[[package]]
|
||||
name = "mmh3"
|
||||
version = "3.0.0"
|
||||
description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "murmurhash"
|
||||
version = "1.0.6"
|
||||
|
@ -659,7 +667,7 @@ cffi = ["cffi (>=1.11)"]
|
|||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "d551f110c809c3c84dcd7061a00f8a2b6fb75bab5a7550fbf4bfe60d4300b37b"
|
||||
content-hash = "8e573b5968296b81e95cfe0308ad10a5a5e2f80e2a9020a2478d61ae751c4d0c"
|
||||
|
||||
[metadata.files]
|
||||
beautifulsoup4 = [
|
||||
|
@ -940,6 +948,34 @@ markupsafe = [
|
|||
{file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"},
|
||||
{file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"},
|
||||
]
|
||||
mmh3 = [
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:23912dde2ad4f701926948dd8e79a0e42b000f73962806f153931f52985e1e07"},
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:07f1308a410dc406d6a3c282a685728d00a87f3ed684f012671b96d6cc6a41c3"},
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:167cbc2b5ae27f3bccd797a2e8a9e7561791bee4cc2885f2c140eedc5df000ef"},
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:8fb833c2942917eff54f984b067d93e5a3c54dbb00720323460cdfed9292835f"},
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-win32.whl", hash = "sha256:b7d26d0243ed9a5b8bf7aa8c53697cb79dff1e1d207f42396b7a7cb2a62298b7"},
|
||||
{file = "mmh3-3.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2b6c79fc314b34b911245b460a79b601fff39bb807521fb7ed7c15cacf0394ac"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d0b3e9def1fdfe4eadd35ee26bf72bd715ba97711f7101302d54c9d2e70ba27"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:8803d28c17cf898f5f00c0433e8b13d51fa3bb4ebecf59872ba1eaa20d94128a"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:01e456edf9cc381298a590923aadd1c0bf9934d93433099a5001d656112437c2"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:ff69ddc2d46e3e42720840b6b4f7bfb032fd1e677fac347fdfff6e4d9fd01212"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-win32.whl", hash = "sha256:e08a5d81a2ff53625953290187bed4ae96a6972e2b5cd5984a6ebc5a9aab256c"},
|
||||
{file = "mmh3-3.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:12484ac80373db77d8a6beb7615e7dac8b6c3fb118905311a51450b4fc4a24d1"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:93c96e657e9bf9e9ef12ddaeae9f109c0b3134146e2eff2cbddde5a34190920e"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:9097be65aa95460bc68b6108601da8894757532450daf74034e4eaecd536acca"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:19874e12acb4119ef1ef83062ef4ac953c3343dd07a67ede8fa096d0393f34be"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:4589adcb609d1547aac7c1ac1064eb27cdd44b65b7e8a114e2971cd3b7110306"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-win32.whl", hash = "sha256:7a311efd4ecf122f21392ec6bf447c620cc783d20bdb9aec60bb469a54318419"},
|
||||
{file = "mmh3-3.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:3566d1455fa4a09f8fb1aa5b37f68914949674f9aa2bd630e9fdf344207f55b5"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:92fdffd63edb67c30dbaba18a7448d762209c0e678b0c9d577d17b30362b59a3"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e52b869572c09db0c1a483f6e9cedbccfae8a282d95e552d3d4bd0712ab3196"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f1cce018cc82a8a6287e6aeb139e441129837b810f2ddf372e3ff7f0fefb0947"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:0fd09c4b61fcddbcf0a87d5463b4e6d2919896736a67efc5248d5c74c1c9c742"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c17fe2e276edd37ad8a6aff3b1663d3479c2c5c5993539c1050422a1dae33033"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-win32.whl", hash = "sha256:150439b906b4deaf6d796b2c2d11fb6159f08d02330d97723071ab3bf43b51df"},
|
||||
{file = "mmh3-3.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:bd870aedd9189eff1cf4e1687869b56c7e9461ee869789139c3e704009e5c227"},
|
||||
{file = "mmh3-3.0.0.tar.gz", hash = "sha256:d1ec578c09a07d3518ec9be540b87546397fa3455de73c166fcce51eaa5c41c5"},
|
||||
]
|
||||
murmurhash = [
|
||||
{file = "murmurhash-1.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1431d817e1fff1ed35f8dc54dd5b4d70165ec98076de8aca351805f8037293f3"},
|
||||
{file = "murmurhash-1.0.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c7b8cc4a8db1c821b80f8ca70a25c3166b14d68ecef8693a117c6a0b1d74ace"},
|
||||
|
|
|
@ -19,6 +19,7 @@ pyspark = "^3.2.0"
|
|||
langdetect = "^1.0.9"
|
||||
zstandard = "^0.16.0"
|
||||
spacy = "^3.2.1"
|
||||
mmh3 = "^3.0.0"
|
||||
|
||||
[tool.poetry.dependencies.en_core_web_sm]
|
||||
url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
|
||||
|
|
Loading…
Reference in a new issue