Export URLs to sqlite for evaluation purposes

This commit is contained in:
Daoud Clarke 2022-01-02 20:06:13 +00:00
parent ae7312c32a
commit 25918e42ef
3 changed files with 54 additions and 8 deletions

41
analyse/export_urls.py Normal file
View file

@ -0,0 +1,41 @@
"""
Export the list of unique URLs to a SQLite file for analysis/evaluation.
"""
import sqlite3
from mwmbl.indexer.paths import URLS_PATH
from mwmbl.tinysearchengine.app import get_config_and_index
def create_database(path=None):
    """
    Create the `urls` table (a single `url TEXT PRIMARY KEY` column).

    The table is created only if it is missing, so the export can be run
    again against an existing database file without raising
    `sqlite3.OperationalError`.

    Args:
        path: Location of the SQLite database file. Defaults to URLS_PATH
            when omitted.
    """
    if path is None:
        path = URLS_PATH

    connection = sqlite3.connect(path)
    try:
        # Using the connection as a context manager only commits or rolls
        # back the transaction; it does not close the connection, hence the
        # explicit close() below.
        with connection:
            connection.execute("""
                CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)
            """)
    finally:
        connection.close()
def get_url_batches():
    """
    Yield one list of URLs for every non-empty page of the index.

    The config and index come from `get_config_and_index()`. Pages for which
    `index.get_page` returns None are skipped. A progress line is printed
    every 1000 pages.
    """
    config, index = get_config_and_index()
    total_pages = config.index_config.num_pages
    for page_num in range(total_pages):
        if page_num % 1000 == 0:
            print("Processing page", page_num)

        documents = index.get_page(page_num)
        if documents is not None:
            # Each item unpacks as (title, url, extract); only the URL is kept.
            yield [url for _title, url, _extract in documents]
def run():
    """
    Export every URL in the index into the `urls` table at URLS_PATH.

    Calls `create_database()` first, then inserts the URLs batch by batch.
    `INSERT OR IGNORE` drops URLs that are already present, so the table
    ends up with unique URLs only.
    """
    create_database()
    url_batches = get_url_batches()

    connection = sqlite3.connect(URLS_PATH)
    try:
        # The connection context manager commits on success and rolls back on
        # error, but it does not close the connection, so close it explicitly.
        with connection:
            for url_batch in url_batches:
                parameters = [(url,) for url in url_batch]
                connection.executemany("""
                    INSERT OR IGNORE INTO urls VALUES (?)
                """, parameters)
    finally:
        connection.close()
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()

View file

@ -14,6 +14,7 @@ TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
# SQLite file that the URL export script writes the unique URLs to.
# NOTE(review): this uses the `/` operator while the neighbouring paths use
# os.path.join, so it relies on DATA_DIR being a pathlib.Path — confirm.
URLS_PATH = DATA_DIR / 'urls.sqlite3'
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')

View file

@ -26,14 +26,7 @@ def main():
* Initialize a FastAPI app instance
* Starts uvicorn server using app instance
"""
args = setup_args()
config = parse_config_file(config_filename=args.config)
# Initialize TinyIndex using index config params
tiny_index = TinyIndex(
item_factory=Document,
**config.index_config.dict()
)
config, tiny_index = get_config_and_index()
# Initialize FastApi instance
app = create_app.create(tiny_index)
@ -42,5 +35,16 @@ def main():
uvicorn.run(app, **config.server_config.dict())
def get_config_and_index():
    """
    Load the configuration and build the TinyIndex it describes.

    The config file name is taken from the `config` attribute of the value
    returned by `setup_args()`.

    Returns:
        A `(config, tiny_index)` tuple.
    """
    config = parse_config_file(config_filename=setup_args().config)

    # The index is constructed from the index section of the config.
    index_params = config.index_config.dict()
    return config, TinyIndex(item_factory=Document, **index_params)
# Start the application only when this file is executed as a script.
if __name__ == "__main__":
    main()