index.py

  1. """
  2. Create a search index
  3. """
  4. from collections import Counter
  5. from itertools import islice
  6. from typing import Iterator, Iterable
  7. from urllib.parse import unquote
  8. import pandas as pd
  9. # NUM_PAGES = 8192
  10. # PAGE_SIZE = 512
  11. from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
  12. NUM_INITIAL_TOKENS = 50
  13. HTTP_START = 'http://'
  14. HTTPS_START = 'https://'
  15. BATCH_SIZE = 100
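

# A token counts as content if its lexeme is alphabetic or numeric and it is
# not a stop word.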
def is_content_token(nlp, token):
    lexeme = nlp.vocab[token.orth]
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
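

# Tokenize text with spaCy, keep only the content tokens among the first
# NUM_INITIAL_TOKENS tokens, and return them as a lowercased set.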
def tokenize(nlp, cleaned_text):
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered
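

# Strip the scheme from a URL and replace '/', '.' and '_' with spaces so the
# remaining parts can be tokenized like ordinary text.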
def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    elif url.startswith(HTTPS_START):
        url = url[len(HTTPS_START):]
    for c in '/._':
        if c in url:
            url = url.replace(c, ' ')
    return url
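

# Tokenize the title, the (unquoted) URL and the extract of each input row and
# yield a TokenizedDocument with the union of the three token sets.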
def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
    for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))
        url_tokens = tokenize(nlp, prepared_url)
        extract_tokens = tokenize(nlp, extract)
        print("Extract tokens", extract_tokens)
        tokens = title_tokens | url_tokens | extract_tokens
        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)

        if i % 1000 == 0:
            print("Processed", i)
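

# Yield successive tuples of up to n items from an iterator.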
def grouper(n: int, iterator: Iterator):
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk
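

# Index every token of every page against its Document, counting term
# frequencies along the way, and write the counts to a CSV at terms_path.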
def index_titles_urls_and_extracts(indexer: TinyIndexer, nlp, titles_urls_and_extracts, terms_path):
    indexer.create_if_not_exists()
    terms = Counter()
    pages = get_pages(nlp, titles_urls_and_extracts)
    for page in pages:
        for token in page.tokens:
            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
        terms.update([t.lower() for t in page.tokens])

    term_df = pd.DataFrame({
        'term': terms.keys(),
        'count': terms.values(),
    })
    term_df.to_csv(terms_path)
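

# Usage sketch (illustrative only): assuming a spaCy English model such as
# 'en_core_web_sm' is installed, the tokenizing helpers can be exercised on
# their own; constructing a TinyIndexer is project-specific and not shown here.
#
#     import spacy
#
#     nlp = spacy.load('en_core_web_sm')
#     prepared = prepare_url_for_tokenizing('https://en.wikipedia.org/wiki/Search_engine')
#     tokenize(nlp, prepared)  # a set like {'wikipedia', 'org', 'wiki', 'search', 'engine', ...}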