
Store computed link counts

Daoud Clarke, 3 years ago
parent
commit b5b2005323

+ 1 - 1
analyse/analyse_crawled_domains.py

@@ -7,7 +7,7 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
+from mwmbl.indexer.paths import CRAWL_GLOB
 
 
 def get_urls():
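
This commit moves CRAWL_GLOB into mwmbl.indexer.paths so the pattern is defined once and imported everywhere. A minimal sketch of iterating the files it matches, assuming only what the pattern itself implies (gzipped JSON, one blob per file); the record layout inside each file is not specified by this diff:

```python
import glob
import gzip
import json

from mwmbl.indexer.paths import CRAWL_GLOB


def iter_crawl_files():
    # CRAWL_GLOB matches *.json.gz files, so each match is a gzipped JSON blob.
    for path in glob.glob(CRAWL_GLOB):
        with gzip.open(path, 'rt') as crawl_file:
            yield json.load(crawl_file)
```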

+ 2 - 2
analyse/make_curl.py

@@ -5,11 +5,11 @@ import os
 from itertools import islice
 from urllib.parse import quote
 
-from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR
 from mwmbl.indexer.wiki import get_wiki_titles_and_urls
 
 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
-CURL_FILE = os.path.join(DATA_DIR, "urls.curl")
+CURL_FILE = os.path.join(TINYSEARCH_DATA_DIR, "urls.curl")
 
 
 def get_urls():

+ 2 - 2
analyse/performance.py

@@ -12,13 +12,13 @@ from mwmbl.tinysearchengine import create_app
 from mwmbl.indexer.fsqueue import ZstdJsonSerializer
 from mwmbl.indexer.index import index_titles_urls_and_extracts
 from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.indexer.paths import TEST_INDEX_PATH, TINYSEARCH_DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
 TEST_PAGE_SIZE = 512
 TEST_NUM_PAGES = 1024
-TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd')
+TEST_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'test-urls.zstd')
 RECALL_AT_K = 3
 
 NUM_QUERY_CHARS = 10

+ 3 - 3
mwmbl/indexer/domains/domain_titles.py

@@ -9,7 +9,7 @@ import bs4
 import requests
 
 from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
 
 NUM_PROCESSES = 10
 
@@ -33,8 +33,8 @@ def get_redirect_no_cookies(url, max_redirects=5):
 
 
 def get_domain_titles():
-    domains_queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
-    titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
+    domains_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
+    titles_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer())
     while True:
         items_id, items = domains_queue.get()
         titles = retrieve_titles(items)
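
The rename leaves the FSQueue protocol itself untouched. Below is a consumer sketch built only from the calls visible in this hunk: the constructor, and get() returning a batch id plus deserialized items. handle_batch is a hypothetical callback, and whether get() blocks on an empty queue is an assumption:

```python
from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME


def drain_domains(handle_batch):
    # Mirrors the loop in get_domain_titles(): each get() returns a batch
    # identifier and the deserialized items queued under that id.
    domains_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
    while True:
        items_id, items = domains_queue.get()
        handle_batch(items_id, items)
```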

+ 2 - 2
mwmbl/indexer/domains/queue_domains.py

@@ -5,7 +5,7 @@ import csv
 import gzip
 
 from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, TINYSEARCH_DATA_DIR
 
 BATCH_SIZE = 250
 
@@ -18,7 +18,7 @@ def get_domains():
 
 
 def queue_domains():
-    queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
+    queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer())
     queued = 0
     batch = []
     for rank, domain in get_domains():

+ 2 - 2
mwmbl/indexer/index_crawl.py

@@ -7,7 +7,7 @@ import spacy
 
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 
 
@@ -25,7 +25,7 @@ def index_mwmbl_crawl_data():
 
 
 def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
+    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
     input_queue.unlock_all()
     while True:
         try:

+ 4 - 4
mwmbl/indexer/links.py

@@ -7,7 +7,7 @@ import json
 from collections import defaultdict
 from urllib.parse import urlparse
 
-from analyse.analyse_crawled_domains import CRAWL_GLOB
+from mwmbl.indexer.paths import CRAWL_GLOB, LINK_COUNT_PATH
 
 
 def get_urls():
@@ -30,9 +30,9 @@ def collect_links(urls):
 def run():
     url_links = get_urls()
     collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
-    for url, items in top:
-        print("URL", url, len(items))
+    link_counts = {url: len(links) for url, links in collected.items()}
+    with open(LINK_COUNT_PATH, 'w') as output_file:
+        json.dump(link_counts, output_file, indent=2)
 
 
 if __name__ == '__main__':
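
This is the substantive change in the commit: run() no longer prints the 1,000 most-linked URLs but writes every URL's inbound-link count to LINK_COUNT_PATH as JSON. A sketch of recovering the old top-N view from the stored file, assuming the {url: count} layout written above:

```python
import json

from mwmbl.indexer.paths import LINK_COUNT_PATH


def load_top_link_counts(n=1000):
    # Read back the {url: count} mapping written by links.run().
    with open(LINK_COUNT_PATH) as count_file:
        link_counts = json.load(count_file)
    # Reproduce the old behaviour: the n URLs with the most inbound links.
    return sorted(link_counts.items(), key=lambda item: item[1], reverse=True)[:n]
```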

+ 14 - 9
mwmbl/indexer/paths.py

@@ -3,21 +3,26 @@ from pathlib import Path
 
 HOME = os.getenv('HOME')
 
-DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
-COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
+DATA_DIR = Path(os.environ['HOME']) / 'data'
+TINYSEARCH_DATA_DIR = DATA_DIR / 'tinysearch'
+COMMON_CRAWL_TERMS_PATH = TINYSEARCH_DATA_DIR / 'common-craw-terms.csv'
 
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
-TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
-WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
-WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+TEST_INDEX_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-test.tinysearch')
+TEST_TERMS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-terms.csv')
+WIKI_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
+WIKI_TITLES_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'abstract-titles-sorted.txt.gz')
 
-URLS_PATH = DATA_DIR / 'urls.sqlite3'
+URLS_PATH = TINYSEARCH_DATA_DIR / 'urls.sqlite3'
 DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
-DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
+DOMAINS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'top10milliondomains.csv.gz')
 
 LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
 INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
 MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
 
-TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
+TOP_DOMAINS_JSON_PATH = TINYSEARCH_DATA_DIR / 'hn-top-domains.json'
+
+MWMBL_DATA_DIR = DATA_DIR / "mwmbl"
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/*/*/*/*.json.gz"
+LINK_COUNT_PATH = MWMBL_DATA_DIR / 'crawl-counts.json'
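
After this change the constants describe two sibling trees under ~/data: tinysearch/ for index artifacts and mwmbl/ for crawl output. An illustrative check of where the new names resolve (the printed paths assume HOME=/home/user):

```python
from mwmbl.indexer.paths import (
    CRAWL_GLOB,
    LINK_COUNT_PATH,
    MWMBL_DATA_DIR,
    TINYSEARCH_DATA_DIR,
)

# With HOME=/home/user this prints:
#   /home/user/data/tinysearch
#   /home/user/data/mwmbl
#   /home/user/data/mwmbl/b2/*/*/*/*/*/*.json.gz
#   /home/user/data/mwmbl/crawl-counts.json
for path in (TINYSEARCH_DATA_DIR, MWMBL_DATA_DIR, CRAWL_GLOB, LINK_COUNT_PATH):
    print(path)
```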