index.py

  1. """
  2. Create a search index
  3. """
  4. import gzip
  5. import sqlite3
  6. from dataclasses import dataclass
  7. from glob import glob
  8. from itertools import chain, count, islice
  9. from typing import List, Iterator
  10. from urllib.parse import unquote
  11. import bs4
  12. import justext
  13. from spacy.lang.en import English
  14. from paths import CRAWL_GLOB, INDEX_PATH
  15. NUM_INITIAL_TOKENS = 50
  16. HTTP_START = 'http://'
  17. HTTPS_START = 'https://'
  18. BATCH_SIZE = 10000
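
# Descriptive note: only the first NUM_INITIAL_TOKENS tokens of each text are
# considered when tokenizing, and index_titles_and_urls() writes pages to the
# SQLite index in batches of BATCH_SIZE.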


def is_content_token(nlp, token):
    lexeme = nlp.vocab[token.orth]
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


def tokenize(nlp, cleaned_text):
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered
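
# Illustrative example: tokenize(English(), "The Quick Brown Fox jumps") would
# return a set such as {'quick', 'brown', 'fox', 'jumps'} -- stop words and
# non-alphanumeric tokens are dropped and the remaining tokens are lowercased.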


def clean(content):
    text = justext.justext(content, justext.get_stoplist("English"))
    pars = [par.text for par in text if not par.is_boilerplate]
    cleaned_text = ' '.join(pars)
    return cleaned_text
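
# Descriptive note: justext splits the HTML into paragraphs and flags
# boilerplate (navigation, headers, footers); only the text of the remaining
# paragraphs is kept, joined into a single string.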


@dataclass
class Page:
    tokens: List[str]
    url: str
    title: str


class Indexer:
    def __init__(self, index_path):
        self.index_path = index_path

    def index(self, pages: List[Page]):
        with sqlite3.connect(self.index_path) as con:
            cursor = con.execute("""
                SELECT max(id) FROM pages
            """)
            current_id = cursor.fetchone()[0]
            if current_id is None:
                first_page_id = 1
            else:
                first_page_id = current_id + 1

            page_ids = range(first_page_id, first_page_id + len(pages))
            urls_titles_ids = ((page.url, page.title, page_id)
                               for page, page_id in zip(pages, page_ids))
            con.executemany("""
                INSERT INTO pages (url, title, id)
                VALUES (?, ?, ?)
            """, urls_titles_ids)

            tokens = chain(*([(term, page_id) for term in page.tokens]
                             for page, page_id in zip(pages, page_ids)))
            con.executemany("""
                INSERT INTO terms (term, page_id)
                VALUES (?, ?)
            """, tokens)

    def create_if_not_exists(self):
        con = sqlite3.connect(self.index_path)
        con.execute("""
            CREATE TABLE IF NOT EXISTS pages (
                id INTEGER PRIMARY KEY,
                url TEXT UNIQUE,
                title TEXT
            )
        """)

        con.execute("""
            CREATE TABLE IF NOT EXISTS terms (
                term TEXT,
                page_id INTEGER
            )
        """)

        con.execute("""
            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
        """)

    def page_indexed(self, url):
        con = sqlite3.connect(self.index_path)
        result = con.execute("""
            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
        """, (url,))
        value = result.fetchone()[0]
        return value == 1

    def get_num_tokens(self):
        con = sqlite3.connect(self.index_path)
        cursor = con.execute("""
            SELECT count(*) FROM terms
        """)
        num_terms = cursor.fetchone()[0]
        return num_terms

    def get_random_terms(self, n):
        con = sqlite3.connect(self.index_path)
        # Bind n to the LIMIT placeholder
        cursor = con.execute("""
            SELECT DISTINCT term FROM terms
            ORDER BY random() LIMIT ?
        """, (n,))
        terms = [t[0] for t in cursor.fetchall()]
        return terms
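
# Minimal usage sketch (illustrative only; 'index.db' is a hypothetical path):
#
#   indexer = Indexer('index.db')
#   indexer.create_if_not_exists()
#   indexer.index([Page(['example', 'search'], 'https://example.com', 'Example')])
#   indexer.page_indexed('https://example.com')  # -> True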


def run():
    indexer = Indexer(INDEX_PATH)
    indexer.create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        print("Path", path)
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()

        if indexer.page_indexed(url):
            print("Page exists, skipping", url)
            continue

        cleaned_text = clean(content)
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        # Indexer.index expects a list of Page objects, so wrap the single page
        indexer.index([Page(list(tokens), url, title)])


def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    elif url.startswith(HTTPS_START):
        url = url[len(HTTPS_START):]
    for c in '/._':
        if c in url:
            url = url.replace(c, ' ')
    return url
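
# Illustrative example:
#   prepare_url_for_tokenizing('https://en.wikipedia.org/wiki/Python_(programming_language)')
#   -> 'en wikipedia org wiki Python (programming language)'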


def get_pages(nlp, titles_and_urls):
    for i, (title_cleaned, url) in enumerate(titles_and_urls):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))
        url_tokens = tokenize(nlp, prepared_url)
        tokens = title_tokens | url_tokens
        yield Page(list(tokens), url, title_cleaned)

        if i % 1000 == 0:
            print("Processed", i)


def grouper(n: int, iterator: Iterator):
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk
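
# Illustrative example: grouper(2, iter('abcde')) yields
# ('a', 'b'), ('c', 'd') and then ('e',).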


def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
    indexer.create_if_not_exists()
    pages = get_pages(nlp, titles_and_urls)
    for chunk in grouper(BATCH_SIZE, pages):
        indexer.index(list(chunk))


if __name__ == '__main__':
    run()