Save results to gzip file

Daoud Clarke 2021-12-07 22:10:16 +00:00
parent 2562a5257a
commit 2d554b14e7
2 changed files with 13 additions and 6 deletions


@@ -56,10 +56,10 @@ def run():
         WHERE crawl = 'CC-MAIN-2021-43'
         AND subset = 'warc'
     ''')
-    sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys())))
-    print("Got rows", sqldf.take(10))
-    print("Num rows", sqldf.count())
+    sqldf = sqldf.sample(fraction=0.01)
+    sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys())))
+    # print("Got rows", sqldf.take(10))
+    # print("Num rows", sqldf.count())
+    sqldf.write.option('compression', 'gzip').format('json').mode('overwrite').save(RECORDS_PATH)
     # warc_recs = sqldf.select("url", "warc_filename", "warc_record_offset", "warc_record_length").rdd
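
For orientation, a minimal sketch, not part of the commit, of reading back what this write produces: Spark's json writer with gzip compression saves one part file of newline-delimited JSON per partition under RECORDS_PATH. The glob pattern mirrors ARCHIVE_INFO_GLOB from the second file, and the helper name is hypothetical.

import gzip
import json
from glob import glob

def read_saved_records(records_glob='outputs/records/*.gz'):
    # Each gzipped part file holds one JSON object per line.
    for path in sorted(glob(records_glob)):
        with gzip.open(path, 'rt') as part:
            for line in part:
                yield json.loads(line)

This is presumably close to what get_records() in the second file does, given that it globs the same gzipped output.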


@@ -1,10 +1,15 @@
+import gzip
+import json
 import os
 from glob import glob
 from itertools import islice
+from pathlib import Path

 from extract_process import fetch_process_warc_records

+DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
+EXTRACTS_PATH = DATA_DIR / 'extracts'
 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'
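
A practical note, not in the commit: gzip.open raises FileNotFoundError when the target directory is missing, since it does not create parent directories. Something like the line below would need to run (or the directory be created by hand) before run() writes there.

# Ensure ~/data/tinysearch/extracts exists before gzip.open writes into it.
EXTRACTS_PATH.mkdir(parents=True, exist_ok=True)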
@@ -17,9 +22,11 @@ def get_records():
 def run():
     records = get_records()
-    processed = fetch_process_warc_records(islice(records, 10))
-    for row in processed:
-        print("Processed", row)
+    processed = fetch_process_warc_records(islice(records, 50))
+    with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
+        for row in processed:
+            output_file.write(json.dumps(row) + '\n')
+            print("Processed", row)


 if __name__ == '__main__':
     run()
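
The islice(records, 50) call still caps a run at the first 50 records, which reads like a development limit rather than a full pass. A hedged sketch, using only the itertools import already present here, of a hypothetical batched() helper that would let run() walk the whole record stream in fixed-size chunks:

from itertools import islice

def batched(iterable, size):
    # Yield successive lists of at most `size` items until the
    # iterable is exhausted.
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

Each batch could then be passed to fetch_process_warc_records in turn, keeping memory bounded while still covering every record instead of only the first 50.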