Store computed link counts
This commit is contained in:
parent 00d18c3474 · commit b5b2005323
8 changed files with 30 additions and 25 deletions
@@ -7,7 +7,7 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
+from mwmbl.indexer.paths import CRAWL_GLOB
 
 
 def get_urls():
@@ -5,11 +5,11 @@ import os
 from itertools import islice
 from urllib.parse import quote
 
-from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR
 from mwmbl.indexer.wiki import get_wiki_titles_and_urls
 
 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
-CURL_FILE = os.path.join(DATA_DIR, "urls.curl")
+CURL_FILE = os.path.join(TINYSEARCH_DATA_DIR, "urls.curl")
 
 
 def get_urls():
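
For context, these constants drive a script that writes a curl config file of completion queries. The sketch below is a hedged guess only: the real function body is not part of this diff, write_curl_file and num_urls are hypothetical names, and get_wiki_titles_and_urls is assumed to yield (title, url) pairs.

from itertools import islice
from urllib.parse import quote

from mwmbl.indexer.wiki import get_wiki_titles_and_urls

URL_TEMPLATE = "http://localhost:8000/complete?q={}"

def write_curl_file(curl_path, num_urls=100):
    # Hypothetical sketch: emit one "url = ..." line per Wikipedia title,
    # in curl's config-file format (consumed with `curl -K <file>`).
    titles_and_urls = get_wiki_titles_and_urls()
    with open(curl_path, 'w') as curl_file:
        for title, url in islice(titles_and_urls, num_urls):
            curl_file.write(f'url = "{URL_TEMPLATE.format(quote(title))}"\n')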
@@ -12,13 +12,13 @@ from mwmbl.tinysearchengine import create_app
 from mwmbl.indexer.fsqueue import ZstdJsonSerializer
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.indexer.paths import TEST_INDEX_PATH, TINYSEARCH_DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
 TEST_PAGE_SIZE = 512
 TEST_NUM_PAGES = 1024
-TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd')
+TEST_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'test-urls.zstd')
 RECALL_AT_K = 3
 
 NUM_QUERY_CHARS = 10
@@ -9,7 +9,7 @@ import bs4
 import requests
 
 from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
 
 NUM_PROCESSES = 10
 
@@ -33,8 +33,8 @@ def get_redirect_no_cookies(url, max_redirects=5):
 
 
 def get_domain_titles():
-    domains_queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
-    titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
+    domains_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
+    titles_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
     while True:
         items_id, items = domains_queue.get()
         titles = retrieve_titles(items)
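
Note that this is purely a rename: the old DATA_DIR and the new TINYSEARCH_DATA_DIR both resolve to HOME/data/tinysearch, so the queues stay where they were on disk. The consumer pattern visible in this hunk reduces to the minimal sketch below; only the FSQueue constructor and get() appear in the diff, and handle_batch is a hypothetical stand-in for the real per-batch work.

from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME

def consume_domains(handle_batch):
    # handle_batch is a hypothetical callback; only the constructor and
    # get() are visible in this diff.
    queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
    while True:
        items_id, items = queue.get()  # next serialized batch and its id
        handle_batch(items)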
@@ -5,7 +5,7 @@ import csv
 import gzip
 
 from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, TINYSEARCH_DATA_DIR
 
 BATCH_SIZE = 250
 
@@ -18,7 +18,7 @@ def get_domains():
 
 
 def queue_domains():
-    queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
+    queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
     queued = 0
     batch = []
    for rank, domain in get_domains():
@@ -7,7 +7,7 @@ import spacy
 
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 
 
@@ -25,7 +25,7 @@ def index_mwmbl_crawl_data():
 
 
 def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
+    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
     input_queue.unlock_all()
     while True:
         try:
@@ -7,7 +7,7 @@ import json
 from collections import defaultdict
 from urllib.parse import urlparse
 
-from analyse.analyse_crawled_domains import CRAWL_GLOB
+from mwmbl.indexer.paths import CRAWL_GLOB, LINK_COUNT_PATH
 
 
 def get_urls():
@@ -30,9 +30,9 @@ def collect_links(urls):
 def run():
     url_links = get_urls()
     collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
-    for url, items in top:
-        print("URL", url, len(items))
+    link_counts = {url: len(links) for url, links in collected.items()}
+    with open(LINK_COUNT_PATH, 'w') as output_file:
+        json.dump(link_counts, output_file, indent=2)
 
 
 if __name__ == '__main__':
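
This hunk is the substance of the commit: instead of printing the 1,000 most-linked URLs, run() now persists every URL's link count as JSON. A minimal sketch of a downstream reader, which also reproduces the old top-1000 report from the stored file (load_link_counts is a hypothetical name):

import json

from mwmbl.indexer.paths import LINK_COUNT_PATH

def load_link_counts():
    # crawl-counts.json maps each crawled URL to its computed link count
    with open(LINK_COUNT_PATH) as input_file:
        return json.load(input_file)

link_counts = load_link_counts()
top = sorted(link_counts.items(), key=lambda x: x[1], reverse=True)[:1000]
for url, count in top:
    print("URL", url, count)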
@@ -3,21 +3,26 @@ from pathlib import Path
 
 HOME = os.getenv('HOME')
 
-DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
-COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
+DATA_DIR = Path(os.environ['HOME']) / 'data'
+TINYSEARCH_DATA_DIR = DATA_DIR / 'tinysearch'
+COMMON_CRAWL_TERMS_PATH = TINYSEARCH_DATA_DIR / 'common-craw-terms.csv'
 
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
-TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
-WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
-WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+TEST_INDEX_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-test.tinysearch')
+TEST_TERMS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-terms.csv')
+WIKI_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
+WIKI_TITLES_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'abstract-titles-sorted.txt.gz')
 
-URLS_PATH = DATA_DIR / 'urls.sqlite3'
+URLS_PATH = TINYSEARCH_DATA_DIR / 'urls.sqlite3'
 DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
-DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
+DOMAINS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'top10milliondomains.csv.gz')
 
 LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
 INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
 MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
 
-TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
+TOP_DOMAINS_JSON_PATH = TINYSEARCH_DATA_DIR / 'hn-top-domains.json'
+
+MWMBL_DATA_DIR = DATA_DIR / "mwmbl"
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/*/*/*/*.json.gz"
+LINK_COUNT_PATH = MWMBL_DATA_DIR / 'crawl-counts.json'
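
A worked example of the rename in mwmbl.indexer.paths, assuming HOME=/home/user: DATA_DIR moves up one level, TINYSEARCH_DATA_DIR takes over its old value (so every existing path resolves to the same location on disk), and the new mwmbl constants give the crawl data an absolute home, replacing the relative glob deleted in the first hunk.

# All values follow from the definitions above, with HOME=/home/user
assert str(DATA_DIR) == '/home/user/data'                        # was .../data/tinysearch
assert str(TINYSEARCH_DATA_DIR) == '/home/user/data/tinysearch'  # the old DATA_DIR value
assert str(URLS_PATH) == '/home/user/data/tinysearch/urls.sqlite3'  # unchanged on disk
assert CRAWL_GLOB == '/home/user/data/mwmbl/b2/*/*/*/*/*/*.json.gz'
assert str(LINK_COUNT_PATH) == '/home/user/data/mwmbl/crawl-counts.json'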