Analyse crawled URLs and domains

parent 171fa645d2
commit 70254ae160

2 changed files with 47 additions and 0 deletions
42  analyse/analyse_crawled_domains.py  Normal file
@@ -0,0 +1,42 @@
"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from urllib.parse import urlparse


CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"


def get_urls():
    for path in glob.glob(CRAWL_GLOB):
        data = json.load(gzip.open(path))
        for item in data['items']:
            yield item['url']


def analyse_urls(urls):
    url_set = set()
    domains = set()
    count = 0
    for url in urls:
        count += 1
        url_set.add(url)
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if path == '':
            domains.add(parsed_url.netloc)

    print("Root pages crawled", sorted(domains))
    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")


def run():
    urls = get_urls()
    analyse_urls(urls)


if __name__ == '__main__':
    run()
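For context: the script expects each gzipped batch file to hold a JSON object with an 'items' list whose entries carry a 'url' key. Below is a minimal sketch (not part of the commit) showing how such a batch could be built and fed to analyse_urls; the file name and sample URLs are hypothetical, and the import assumes the module is on the path.

import gzip
import json

from analyse_crawled_domains import analyse_urls  # assumption: module importable from the working directory

# Write a hypothetical crawl batch in the shape get_urls() reads:
# a gzipped JSON object with an 'items' list of {'url': ...} entries.
batch = {
    "items": [
        {"url": "https://example.com/"},
        {"url": "https://example.com/about"},
        {"url": "https://example.org/"},
    ]
}
with gzip.open("sample_batch.json.gz", "wt") as f:
    json.dump(batch, f)

# Read it back the same way the script does and analyse the URLs.
with gzip.open("sample_batch.json.gz") as f:
    data = json.load(f)

analyse_urls(item["url"] for item in data["items"])
# Expected: 3 unique URLs and 2 root pages (example.com, example.org) from 3 items.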
5  mwmbl/indexer/index_crawl.py  Normal file
@@ -0,0 +1,5 @@
"""
Index data crawled through the Mwmbl crawler.
"""