Fix bugs
parent 77e39b4a89
commit 5783cee6b7

5 changed files with 16 additions and 22 deletions
Binary file not shown.
@@ -64,15 +64,6 @@ class URLDatabase:
         )
         """
 
-        # index_sql = """
-        # CREATE INDEX IF NOT EXISTS host_index
-        # ON urls(substring(url FROM '.*://([^/]*)'), score)
-        # """
-        #
-        # view_sql = """
-        # CREATE OR REPLACE VIEW url_and_hosts AS SELECT *, substring(url FROM '.*://([^/]*)') AS host FROM urls
-        # """
 
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
-            # cursor.execute(index_sql)
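Review note: the deleted lines were a commented-out sketch of a PostgreSQL expression index and host view that were never executed. For reference, actually applying them would have looked roughly like this (hypothetical reconstruction: `connection` stands in for the open psycopg2 connection, and the statements are copied from the deleted comments):

# Hypothetical reconstruction of the dead code this hunk deletes.
# `connection` is assumed to be an open psycopg2 connection.
index_sql = """
    CREATE INDEX IF NOT EXISTS host_index
    ON urls (substring(url FROM '.*://([^/]*)'), score)
"""

view_sql = """
    CREATE OR REPLACE VIEW url_and_hosts AS
    SELECT *, substring(url FROM '.*://([^/]*)') AS host FROM urls
"""

with connection.cursor() as cursor:
    cursor.execute(index_sql)
    cursor.execute(view_sql)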
@@ -133,7 +124,6 @@ class URLDatabase:
 
             logger.info(f"Data: {len(data)}")
             results = execute_values(cursor, insert_sql, data, fetch=True)
-            # results = cursor.fetchall()
             logger.info(f"Results: {len(results)}")
             updated = [FoundURL(*result) for result in results]
             return updated
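Review note: psycopg2's execute_values(..., fetch=True) already collects the rows produced by the statement's RETURNING clause, so the leftover cursor.fetchall() was dead code and is rightly removed. A minimal sketch of the pattern, with an illustrative statement (the real insert_sql lives in URLDatabase and is not shown in this diff):

from psycopg2.extras import execute_values

# Illustrative statement; the real insert_sql is defined in URLDatabase.
insert_sql = """
    INSERT INTO urls (url, status, score) VALUES %s
    RETURNING url, status, score
"""
data = [("https://example.com/", 0, 1.0)]

with connection.cursor() as cursor:  # assumes an open psycopg2 connection
    # fetch=True returns the RETURNING rows directly; no fetchall() needed.
    results = execute_values(cursor, insert_sql, data, fetch=True)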
@@ -56,7 +56,11 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
                 url_statuses[item.url] = get_url_error_status(item)
             else:
                 url_statuses[item.url] = URLStatus.CRAWLED
-                crawled_page_domain = get_domain(item.url)
+                try:
+                    crawled_page_domain = get_domain(item.url)
+                except ValueError:
+                    logger.info(f"Couldn't parse URL {item.url}")
+                    continue
                 score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                 for link in item.content.links:
                     process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
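Review note: get_domain now raises ValueError for unparseable URLs (see the last hunk), so the caller logs and skips just that item instead of letting one bad URL abort the whole batch; the item's URLStatus.CRAWLED entry is still recorded before the parse is attempted. The guard in isolation (illustrative names):

# Illustrative: skip items whose URL cannot be parsed, keep processing the rest.
for item in items:
    try:
        crawled_page_domain = get_domain(item.url)
    except ValueError:
        logger.info(f"Couldn't parse URL {item.url}")
        continue
    handle(item, crawled_page_domain)  # hypothetical downstream processing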
@@ -24,7 +24,7 @@ MAX_URLS_PER_CORE_DOMAIN = 1000
 MAX_URLS_PER_TOP_DOMAIN = 100
 MAX_URLS_PER_OTHER_DOMAIN = 5
 MAX_OTHER_DOMAINS = 10000
-
+MIN_TOP_DOMAINS = 5
 
 @dataclass
 class URLScore:
@@ -62,11 +62,6 @@ class URLQueue:
         return num_processed
 
     def _process_found_urls(self, found_urls: list[FoundURL]):
-        logger.info(f"Processing found URLs: {found_urls[:1000]}")
-        # with open(Path(os.environ["HOME"]) / "data" / "mwmbl" / "found-urls.pickle", "wb") as output_file:
-        #     pickle.dump(found_urls, output_file)
-        # logger.info("Dumped")
-
         min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)
 
         logger.info(f"Found URLS: {len(found_urls)}")
@@ -76,7 +71,7 @@ class URLQueue:
 
         self._sort_urls(valid_urls)
         logger.info(f"Queue size: {self.num_queued_batches}")
-        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > 0:
+        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > MIN_TOP_DOMAINS:
             total_top_urls = sum(len(urls) for urls in self._top_urls.values())
             logger.info(f"Total top URLs stored: {total_top_urls}")
 
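Review note: the old condition kept batching while any top domain still had URLs, which let the queue drain down to a single domain. With the new MIN_TOP_DOMAINS bound, batching stops once too few distinct top domains remain, preserving domain diversity in each batch. A small self-contained sketch of the idea (data and batching logic are illustrative):

from collections import defaultdict

MIN_TOP_DOMAINS = 5

# domain -> queued URLs (illustrative data)
top_urls = defaultdict(list, {
    f"site{i}.example": [f"https://site{i}.example/{n}" for n in range(3)]
    for i in range(8)
})

batches = []
while len(top_urls) > MIN_TOP_DOMAINS:
    # Take one URL from each domain so a batch spans many domains.
    batches.append([urls.pop() for urls in top_urls.values()])
    # Drop exhausted domains; stop once diversity falls too low.
    for domain in [d for d, urls in top_urls.items() if not urls]:
        del top_urls[domain]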
@@ -88,7 +83,10 @@ class URLQueue:
 
     def _sort_urls(self, valid_urls: list[FoundURL]):
         for found_url in valid_urls:
-            domain = get_domain(found_url.url)
+            try:
+                domain = get_domain(found_url.url)
+            except ValueError:
+                continue
             url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
             url_store[domain].append(URLScore(found_url.url, found_url.score))
 
@@ -99,7 +97,7 @@ class URLQueue:
 
         # Keep only the top "other" domains, ranked by the top item for that domain
         top_other_urls = sorted(self._other_urls.items(), key=lambda x: x[1][0].score, reverse=True)[:MAX_OTHER_DOMAINS]
-        self._other_urls = dict(top_other_urls)
+        self._other_urls = defaultdict(list, dict(top_other_urls))
 
     def _batch_urls(self):
         urls = []
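Review note: this is likely the headline bug of the commit. Pruning _other_urls rebuilt it as a plain dict, silently dropping the defaultdict(list) behavior, so the next _sort_urls call crashed with KeyError when appending to an unseen domain. A minimal reproduction:

from collections import defaultdict

other_urls = defaultdict(list)
other_urls["known.example"].append("https://known.example/")

# Old code: rebuilding as a plain dict loses the list default factory...
pruned = dict(sorted(other_urls.items())[:10000])
# pruned["new.example"].append(...)  # KeyError: "new.example"

# Fixed: wrap the pruned mapping back into a defaultdict.
pruned = defaultdict(list, dict(sorted(other_urls.items())[:10000]))
pruned["new.example"].append("https://new.example/")  # OK: list is created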
@@ -13,5 +13,7 @@ def batch(items: list, batch_size):
 
 
 def get_domain(url):
-    domain = DOMAIN_REGEX.search(url)[0]
-    return domain
+    results = DOMAIN_REGEX.match(url)
+    if results is None or len(results.groups()) == 0:
+        raise ValueError(f"Unable to parse domain from URL {url}")
+    return results.group(1)
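Review note: the root-cause fix. DOMAIN_REGEX.search(url) returns None when nothing matches, so the old None[0] failed with an opaque TypeError, and even on success [0] is the entire match rather than the captured host. The rewrite raises a well-defined ValueError and returns group(1). A runnable sketch, assuming a host pattern like the substring(url FROM '.*://([^/]*)') expression seen in the first hunk (the real DOMAIN_REGEX is defined elsewhere in the module):

import re

# Assumed pattern: the real DOMAIN_REGEX is defined elsewhere in the module.
DOMAIN_REGEX = re.compile(r".*://([^/]*)")

def get_domain(url):
    results = DOMAIN_REGEX.match(url)
    if results is None or len(results.groups()) == 0:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return results.group(1)

print(get_domain("https://example.com/page"))  # example.com
# get_domain("not-a-url") now raises ValueError, not TypeError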