Save results to gzip file
This commit is contained in:
parent
2562a5257a
commit
2d554b14e7
2 changed files with 13 additions and 6 deletions
|
@ -56,10 +56,10 @@ def run():
|
|||
WHERE crawl = 'CC-MAIN-2021-43'
|
||||
AND subset = 'warc'
|
||||
''')
|
||||
sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys())))
|
||||
print("Got rows", sqldf.take(10))
|
||||
print("Num rows", sqldf.count())
|
||||
sqldf = sqldf.sample(fraction=0.01)
|
||||
sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys())))
|
||||
# print("Got rows", sqldf.take(10))
|
||||
# print("Num rows", sqldf.count())
|
||||
sqldf.write.option('compression', 'gzip').format('json').mode('overwrite').save(RECORDS_PATH)
|
||||
|
||||
# warc_recs = sqldf.select("url", "warc_filename", "warc_record_offset", "warc_record_length").rdd
|
||||
|
|
|
@ -1,10 +1,15 @@
|
|||
import gzip
|
||||
import json
|
||||
import os
|
||||
from glob import glob
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
|
||||
from extract_process import fetch_process_warc_records
|
||||
|
||||
# Root directory for tinysearch data, rooted at the user's home directory.
# NOTE(review): assumes the HOME environment variable is set (POSIX) — confirm
# if this is ever run on Windows.
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'

# Directory where the gzipped JSON-lines extracts are written.
EXTRACTS_PATH = DATA_DIR / 'extracts'

# Glob pattern matching the gzipped record listings written by the Spark job.
ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'
|
||||
|
||||
|
||||
|
@ -17,9 +22,11 @@ def get_records():
|
|||
|
||||
def run():
    """Process a batch of WARC records and save the results as gzipped JSON lines.

    Takes up to 50 records from get_records(), runs them through
    fetch_process_warc_records, and writes one JSON document per line to
    EXTRACTS_PATH / 'data.json.gz'.
    """
    records = get_records()
    # islice limits the batch size; the generator from get_records is consumed
    # lazily, so only the records actually processed are fetched.
    processed = fetch_process_warc_records(islice(records, 50))
    # 'wt' opens the gzip stream in text mode so the str output of json.dumps
    # can be written directly (JSON Lines format: one object per line).
    with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
        for row in processed:
            output_file.write(json.dumps(row) + '\n')
            print("Processed", row)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in a new issue