From 2d554b14e7b9a7d9acc9bb118171e469276110e3 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 7 Dec 2021 22:10:16 +0000 Subject: [PATCH] Save results to gzip file --- extract.py | 6 +++--- extract_local.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/extract.py b/extract.py index 24aff83..8c3a70d 100644 --- a/extract.py +++ b/extract.py @@ -56,10 +56,10 @@ def run(): WHERE crawl = 'CC-MAIN-2021-43' AND subset = 'warc' ''') - sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys()))) - print("Got rows", sqldf.take(10)) - print("Num rows", sqldf.count()) sqldf = sqldf.sample(fraction=0.01) + sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys()))) + # print("Got rows", sqldf.take(10)) + # print("Num rows", sqldf.count()) sqldf.write.option('compression', 'gzip').format('json').mode('overwrite').save(RECORDS_PATH) # warc_recs = sqldf.select("url", "warc_filename", "warc_record_offset", "warc_record_length").rdd diff --git a/extract_local.py b/extract_local.py index ea4bc01..5064dbd 100644 --- a/extract_local.py +++ b/extract_local.py @@ -1,10 +1,15 @@ import gzip import json +import os from glob import glob from itertools import islice +from pathlib import Path from extract_process import fetch_process_warc_records +DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' +EXTRACTS_PATH = DATA_DIR / 'extracts' + ARCHIVE_INFO_GLOB = 'outputs/records/*.gz' @@ -17,9 +22,11 @@ def get_records(): def run(): records = get_records() - processed = fetch_process_warc_records(islice(records, 10)) - for row in processed: - print("Processed", row) + processed = fetch_process_warc_records(islice(records, 50)) + with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file: + for row in processed: + output_file.write(json.dumps(row) + '\n') + print("Processed", row) if __name__ == '__main__':