Daoud Clarke 2023-01-24 22:52:58 +00:00
parent 77e39b4a89
commit 5783cee6b7
5 changed files with 16 additions and 22 deletions

Binary file not shown.

View file

@@ -64,15 +64,6 @@ class URLDatabase:
         )
         """
-        # index_sql = """
-        # CREATE INDEX IF NOT EXISTS host_index
-        # ON urls(substring(url FROM '.*://([^/]*)'), score)
-        # """
-        #
-        # view_sql = """
-        # CREATE OR REPLACE VIEW url_and_hosts AS SELECT *, substring(url FROM '.*://([^/]*)') AS host FROM urls
-        # """
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
-            # cursor.execute(index_sql)
@@ -133,7 +124,6 @@ class URLDatabase:
             logger.info(f"Data: {len(data)}")
             results = execute_values(cursor, insert_sql, data, fetch=True)
-            # results = cursor.fetchall()
             logger.info(f"Results: {len(results)}")
             updated = [FoundURL(*result) for result in results]
             return updated
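Context for the removed `# results = cursor.fetchall()` line: when called with `fetch=True`, psycopg2's `execute_values` already collects and returns the rows produced by the `RETURNING` clause, so a separate fetch is redundant. A minimal sketch under assumed table and column names (not the project's actual schema):

```python
# Illustrative only: hypothetical table/columns, showing the fetch=True behaviour
# of psycopg2.extras.execute_values that makes a follow-up fetchall() unnecessary.
import psycopg2
from psycopg2.extras import execute_values

connection = psycopg2.connect("dbname=example")  # hypothetical DSN

insert_sql = """
    INSERT INTO urls (url, status, score) VALUES %s
    ON CONFLICT (url) DO UPDATE SET score = EXCLUDED.score
    RETURNING url, status, score
"""
data = [("https://example.com/", 0, 1.0), ("https://example.org/", 0, 0.5)]

with connection.cursor() as cursor:
    # With fetch=True, execute_values returns the RETURNING rows itself,
    # so calling cursor.fetchall() afterwards would be redundant.
    results = execute_values(cursor, insert_sql, data, fetch=True)
connection.commit()

print(results)  # list of (url, status, score) tuples
```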

View file

@@ -56,7 +56,11 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                 url_statuses[item.url] = get_url_error_status(item)
             else:
                 url_statuses[item.url] = URLStatus.CRAWLED
-                crawled_page_domain = get_domain(item.url)
+                try:
+                    crawled_page_domain = get_domain(item.url)
+                except ValueError:
+                    logger.info(f"Couldn't parse URL {item.url}")
+                    continue
                 score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                 for link in item.content.links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
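The hunk above wraps `get_domain` in a `try/except ValueError` so a single unparseable URL is logged and skipped rather than aborting the whole batch. A standalone sketch of that guard, using a simplified stand-in for `get_domain` and made-up URLs:

```python
# Standalone sketch of the skip-on-failure guard; the URL list and this
# simplified get_domain are stand-ins, not mwmbl's real data or helper.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_domain(url):
    # Mirrors the rewritten helper later in this commit: raise ValueError
    # when no host can be extracted from the URL.
    if not url.startswith(("http://", "https://")) or len(url.split("/")) < 3:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return url.split("/")[2]


crawled_urls = ["https://example.com/a", "data:text/plain,oops", "https://example.org/b"]

for url in crawled_urls:
    try:
        crawled_page_domain = get_domain(url)
    except ValueError:
        # One bad URL no longer aborts processing of the remaining items.
        logger.info(f"Couldn't parse URL {url}")
        continue
    print(f"would score links found on {crawled_page_domain}")
```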

View file

@@ -24,7 +24,7 @@ MAX_URLS_PER_CORE_DOMAIN = 1000
 MAX_URLS_PER_TOP_DOMAIN = 100
 MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
+MIN_TOP_DOMAINS = 5

 @dataclass
 class URLScore:
@@ -62,11 +62,6 @@ class URLQueue:
         return num_processed

     def _process_found_urls(self, found_urls: list[FoundURL]):
-        logger.info(f"Processing found URLs: {found_urls[:1000]}")
-        # with open(Path(os.environ["HOME"]) / "data" / "mwmbl" / "found-urls.pickle", "wb") as output_file:
-        #     pickle.dump(found_urls, output_file)
-        # logger.info("Dumped")
         min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)
         logger.info(f"Found URLS: {len(found_urls)}")
@@ -76,7 +71,7 @@ class URLQueue:
         self._sort_urls(valid_urls)
         logger.info(f"Queue size: {self.num_queued_batches}")
-        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > 0:
+        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > MIN_TOP_DOMAINS:
             total_top_urls = sum(len(urls) for urls in self._top_urls.values())
             logger.info(f"Total top URLs stored: {total_top_urls}")
@@ -88,7 +83,10 @@ class URLQueue:
     def _sort_urls(self, valid_urls: list[FoundURL]):
         for found_url in valid_urls:
-            domain = get_domain(found_url.url)
+            try:
+                domain = get_domain(found_url.url)
+            except ValueError:
+                continue
             url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
             url_store[domain].append(URLScore(found_url.url, found_url.score))
@@ -99,7 +97,7 @@ class URLQueue:
         # Keep only the top "other" domains, ranked by the top item for that domain
         top_other_urls = sorted(self._other_urls.items(), key=lambda x: x[1][0].score, reverse=True)[:MAX_OTHER_DOMAINS]
-        self._other_urls = dict(top_other_urls)
+        self._other_urls = defaultdict(list, dict(top_other_urls))

     def _batch_urls(self):
         urls = []
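The switch from `dict(top_other_urls)` to `defaultdict(list, dict(top_other_urls))` matters because `_sort_urls` appends to `url_store[domain]` without checking whether the domain is already present; a plain dict would raise `KeyError` for any domain first seen after the pruning step. A small sketch of the difference (domain names and scores are made up):

```python
from collections import defaultdict

# Stand-in for the pruned "other" URLs mapping kept after sorting.
pruned = {"example.com": [0.9, 0.4]}

plain = dict(pruned)
try:
    plain["new-domain.org"].append(0.5)
except KeyError:
    print("plain dict: KeyError for an unseen domain")

kept = defaultdict(list, pruned)
kept["new-domain.org"].append(0.5)      # missing keys materialise as empty lists
print(kept["new-domain.org"])           # [0.5]
```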

View file

@@ -13,5 +13,7 @@ def batch(items: list, batch_size):


 def get_domain(url):
-    domain = DOMAIN_REGEX.search(url)[0]
-    return domain
+    results = DOMAIN_REGEX.match(url)
+    if results is None or len(results.groups()) == 0:
+        raise ValueError(f"Unable to parse domain from URL {url}")
+    return results.group(1)
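The rewritten `get_domain` raises a `ValueError` that the callers above catch, replacing `DOMAIN_REGEX.search(url)[0]`, which returned the whole matched text and failed with an unhelpful `TypeError` when nothing matched. The regex itself is not shown in this diff; the sketch below assumes a pattern with a single capturing group for the host:

```python
# Assumed pattern (one capturing group for the host); mwmbl's actual
# DOMAIN_REGEX is defined elsewhere and may differ.
import re

DOMAIN_REGEX = re.compile(r"https?://([^/]+)")


def get_domain(url):
    results = DOMAIN_REGEX.match(url)
    if results is None or len(results.groups()) == 0:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return results.group(1)


print(get_domain("https://example.com/some/page"))  # example.com
try:
    get_domain("mailto:someone@example.com")
except ValueError as error:
    print(error)  # Unable to parse domain from URL mailto:someone@example.com
```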