Export URLs to sqlite for evaluation purposes
This commit is contained in:
parent
ae7312c32a
commit
25918e42ef
3 changed files with 54 additions and 8 deletions
41
analyse/export_urls.py
Normal file
41
analyse/export_urls.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
"""
|
||||
Export the list of unique URLs to a SQLite file for analysis/evaluation.
|
||||
"""
|
||||
import sqlite3
|
||||
|
||||
from mwmbl.indexer.paths import URLS_PATH
|
||||
from mwmbl.tinysearchengine.app import get_config_and_index
|
||||
|
||||
|
||||
def create_database(path=None):
    """
    Create the SQLite database that holds the exported URLs, if needed.

    Ensures a ``urls`` table with a single ``url TEXT PRIMARY KEY`` column
    exists. Calling this again on an existing database is a no-op rather
    than an error.

    Args:
        path: Location of the SQLite file. Defaults to ``URLS_PATH``.
    """
    database_path = URLS_PATH if path is None else path
    connection = sqlite3.connect(database_path)
    try:
        # ``with connection`` only commits (or rolls back) the transaction;
        # it does not close the connection, so that is done explicitly below.
        with connection:
            connection.execute("""
                CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)
            """)
    finally:
        connection.close()
|
||||
|
||||
|
||||
def get_url_batches():
    """
    Yield the URLs stored in the index, one list per index page.

    Every page number below ``config.index_config.num_pages`` is visited in
    order. Pages for which ``index.get_page`` returns ``None`` are skipped.
    A progress line is printed every 1000 pages.
    """
    config, index = get_config_and_index()
    for page_num in range(config.index_config.num_pages):
        if not page_num % 1000:
            print("Processing page", page_num)

        documents = index.get_page(page_num)
        if documents is not None:
            # Each item unpacks as (title, url, extract); only the URL is kept.
            yield [url for _title, url, _extract in documents]
|
||||
|
||||
|
||||
def run():
    """
    Export every URL in the index into the ``urls`` table of ``URLS_PATH``.

    The table is created first, then each batch from ``get_url_batches()``
    is inserted. ``INSERT OR IGNORE`` together with the primary key on
    ``url`` means a URL that appears more than once is stored only once.
    """
    create_database()

    connection = sqlite3.connect(URLS_PATH)
    try:
        # The whole export runs inside this ``with`` block, which commits on
        # a clean exit and rolls back if an exception escapes. It does not
        # close the connection, hence the ``finally``.
        with connection:
            for url_batch in get_url_batches():
                connection.executemany("""
                    INSERT OR IGNORE INTO urls VALUES (?)
                """, [(url,) for url in url_batch])
    finally:
        connection.close()
|
||||
|
||||
|
||||
# Run the export only when this file is executed as a script.
if __name__ == '__main__':
    run()
|
|
@ -14,6 +14,7 @@ TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
|
|||
# Input files located under DATA_DIR (presumably Wikipedia dump data, going
# by the file names -- confirm against the code that reads them).
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

# SQLite file that analyse/export_urls.py writes the unique URLs to.
# NOTE(review): the ``/`` operator assumes DATA_DIR is a pathlib.Path -- confirm.
URLS_PATH = DATA_DIR / 'urls.sqlite3'

# Names of the domain queues, and the path of the domains list under DATA_DIR.
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
|
||||
|
|
|
@ -26,14 +26,7 @@ def main():
|
|||
* Initialize a FastAPI app instance
|
||||
* Starts uvicorn server using app instance
|
||||
"""
|
||||
args = setup_args()
|
||||
config = parse_config_file(config_filename=args.config)
|
||||
|
||||
# Initialize TinyIndex using index config params
|
||||
tiny_index = TinyIndex(
|
||||
item_factory=Document,
|
||||
**config.index_config.dict()
|
||||
)
|
||||
config, tiny_index = get_config_and_index()
|
||||
|
||||
# Initialize FastApi instance
|
||||
app = create_app.create(tiny_index)
|
||||
|
@ -42,5 +35,16 @@ def main():
|
|||
uvicorn.run(app, **config.server_config.dict())
|
||||
|
||||
|
||||
def get_config_and_index():
    """
    Load the configuration and build the index it describes.

    The config file name is taken from the ``config`` attribute of the
    result of ``setup_args()``. The index is a ``TinyIndex`` of ``Document``
    items constructed from the ``index_config`` section of that file.

    Returns:
        A ``(config, tiny_index)`` tuple.
    """
    cli_args = setup_args()
    config = parse_config_file(config_filename=cli_args.config)

    # Initialize TinyIndex using index config params
    index_params = config.index_config.dict()
    return config, TinyIndex(item_factory=Document, **index_params)
|
||||
|
||||
|
||||
# Start the application only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
|
|
Loading…
Reference in a new issue