|
@@ -27,17 +27,19 @@
|
|
//! but the principle is the same.
|
|
//! but the principle is the same.
|
|
|
|
|
|
mod indexing;
|
|
mod indexing;
|
|
|
|
+mod key_phrase;
|
|
|
|
+mod retrieved_webpage;
|
|
mod search;
|
|
mod search;
|
|
|
|
|
|
pub use indexing::merge_tantivy_segments;
|
|
pub use indexing::merge_tantivy_segments;
|
|
-
|
|
|
|
-use chrono::{DateTime, NaiveDateTime};
|
|
|
|
|
|
+pub use key_phrase::KeyPhrase;
|
|
|
|
+pub use retrieved_webpage::RetrievedWebpage;
|
|
|
|
|
|
use tantivy::directory::MmapDirectory;
|
|
use tantivy::directory::MmapDirectory;
|
|
|
|
|
|
-use tantivy::schema::{Schema, Value};
|
|
|
|
|
|
+use tantivy::schema::Schema;
|
|
use tantivy::tokenizer::TokenizerManager;
|
|
use tantivy::tokenizer::TokenizerManager;
|
|
-use tantivy::{IndexReader, IndexWriter, TantivyDocument};
|
|
|
|
|
|
+use tantivy::{IndexReader, IndexWriter};
|
|
|
|
|
|
use crate::collector::{approx_count, Hashes};
|
|
use crate::collector::{approx_count, Hashes};
|
|
use crate::config::SnippetConfig;
|
|
use crate::config::SnippetConfig;
|
|
@@ -45,15 +47,10 @@ use crate::numericalfield_reader::NumericalFieldReader;
|
|
|
|
|
|
use crate::ranking::initial::Score;
|
|
use crate::ranking::initial::Score;
|
|
|
|
|
|
-use crate::schema::text_field::TextField;
|
|
|
|
-use crate::schema::{numerical_field, text_field, Field, NumericalFieldEnum, TextFieldEnum};
|
|
|
|
-use crate::snippet::TextSnippet;
|
|
|
|
|
|
+use crate::schema::{numerical_field, Field, NumericalFieldEnum};
|
|
use crate::tokenizer::fields::{
|
|
use crate::tokenizer::fields::{
|
|
- BigramTokenizer, Identity, JsonField, Stemmed, TrigramTokenizer, UrlTokenizer,
|
|
|
|
|
|
+ BigramTokenizer, Identity, JsonField, NewlineTokenizer, Stemmed, TrigramTokenizer, UrlTokenizer,
|
|
};
|
|
};
|
|
-use crate::webpage::region::Region;
|
|
|
|
-
|
|
|
|
-use crate::webpage::schema_org;
|
|
|
|
use crate::Result;
|
|
use crate::Result;
|
|
use crate::{schema::create_schema, tokenizer::FieldTokenizer};
|
|
use crate::{schema::create_schema, tokenizer::FieldTokenizer};
|
|
use std::fs;
|
|
use std::fs;
|
|
@@ -129,6 +126,9 @@ fn register_tokenizers(manager: &TokenizerManager) {
|
|
|
|
|
|
let tokenizer = FieldTokenizer::Json(JsonField);
|
|
let tokenizer = FieldTokenizer::Json(JsonField);
|
|
manager.register(tokenizer.as_str(), tokenizer);
|
|
manager.register(tokenizer.as_str(), tokenizer);
|
|
|
|
+
|
|
|
|
+ let tokenizer = FieldTokenizer::Newline(NewlineTokenizer::default());
|
|
|
|
+ manager.register(tokenizer.as_str(), tokenizer);
|
|
}
|
|
}
|
|
|
|
|
|
pub struct InvertedIndex {
|
|
pub struct InvertedIndex {
|
|
@@ -169,7 +169,6 @@ impl InvertedIndex {
|
|
register_tokenizers(tantivy_index.tokenizers());
|
|
register_tokenizers(tantivy_index.tokenizers());
|
|
|
|
|
|
let reader: IndexReader = tantivy_index.reader_builder().try_into()?;
|
|
let reader: IndexReader = tantivy_index.reader_builder().try_into()?;
|
|
-
|
|
|
|
let columnfield_reader = NumericalFieldReader::new(&reader.searcher());
|
|
let columnfield_reader = NumericalFieldReader::new(&reader.searcher());
|
|
|
|
|
|
Ok(InvertedIndex {
|
|
Ok(InvertedIndex {
|
|
@@ -220,116 +219,6 @@ pub struct SearchResult {
|
|
pub documents: Vec<RetrievedWebpage>,
|
|
pub documents: Vec<RetrievedWebpage>,
|
|
}
|
|
}
|
|
|
|
|
|
-#[derive(
|
|
|
|
- Default,
|
|
|
|
- Debug,
|
|
|
|
- Clone,
|
|
|
|
- serde::Serialize,
|
|
|
|
- serde::Deserialize,
|
|
|
|
- bincode::Encode,
|
|
|
|
- bincode::Decode,
|
|
|
|
- PartialEq,
|
|
|
|
-)]
|
|
|
|
-pub struct RetrievedWebpage {
|
|
|
|
- pub title: String,
|
|
|
|
- pub url: String,
|
|
|
|
- pub body: String,
|
|
|
|
- pub snippet: TextSnippet,
|
|
|
|
- pub dirty_body: String,
|
|
|
|
- pub description: Option<String>,
|
|
|
|
- pub dmoz_description: Option<String>,
|
|
|
|
- #[bincode(with_serde)]
|
|
|
|
- pub updated_time: Option<NaiveDateTime>,
|
|
|
|
- pub schema_org: Vec<schema_org::Item>,
|
|
|
|
- pub region: Region,
|
|
|
|
- pub likely_has_ads: bool,
|
|
|
|
- pub likely_has_paywall: bool,
|
|
|
|
- pub recipe_first_ingredient_tag_id: Option<String>,
|
|
|
|
- pub keywords: Vec<String>,
|
|
|
|
-}
|
|
|
|
-impl RetrievedWebpage {
|
|
|
|
- pub fn description(&self) -> Option<&String> {
|
|
|
|
- self.description.as_ref().or(self.dmoz_description.as_ref())
|
|
|
|
- }
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-fn str_value(name: &str, value: &tantivy::schema::document::CompactDocValue) -> String {
|
|
|
|
- value
|
|
|
|
- .as_str()
|
|
|
|
- .unwrap_or_else(|| panic!("{} field should be text", name))
|
|
|
|
- .to_string()
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-impl From<TantivyDocument> for RetrievedWebpage {
|
|
|
|
- fn from(doc: TantivyDocument) -> Self {
|
|
|
|
- let mut webpage = RetrievedWebpage::default();
|
|
|
|
-
|
|
|
|
- for (field, value) in doc.field_values() {
|
|
|
|
- match Field::get(field.field_id() as usize) {
|
|
|
|
- Some(Field::Text(TextFieldEnum::Title(_))) => {
|
|
|
|
- webpage.title = str_value(text_field::Title.name(), &value);
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
|
|
|
|
- webpage.body = str_value(text_field::StemmedCleanBody.name(), &value);
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::Description(_))) => {
|
|
|
|
- let desc = str_value(text_field::Description.name(), &value);
|
|
|
|
- webpage.description = if desc.is_empty() { None } else { Some(desc) }
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::Url(_))) => {
|
|
|
|
- webpage.url = str_value(text_field::Url.name(), &value);
|
|
|
|
- }
|
|
|
|
- Some(Field::Numerical(NumericalFieldEnum::LastUpdated(_))) => {
|
|
|
|
- webpage.updated_time = {
|
|
|
|
- let timestamp = value.as_u64().unwrap() as i64;
|
|
|
|
- if timestamp == 0 {
|
|
|
|
- None
|
|
|
|
- } else {
|
|
|
|
- DateTime::from_timestamp(timestamp, 0).map(|dt| dt.naive_utc())
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::AllBody(_))) => {
|
|
|
|
- webpage.dirty_body = str_value(text_field::AllBody.name(), &value);
|
|
|
|
- }
|
|
|
|
- Some(Field::Numerical(NumericalFieldEnum::Region(_))) => {
|
|
|
|
- webpage.region = {
|
|
|
|
- let id = value.as_u64().unwrap();
|
|
|
|
- Region::from_id(id)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
|
|
|
|
- let desc = str_value(text_field::DmozDescription.name(), &value);
|
|
|
|
- webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
|
|
|
|
- let json = str_value(text_field::SchemaOrgJson.name(), &value);
|
|
|
|
- webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
|
|
|
|
- }
|
|
|
|
- Some(Field::Numerical(NumericalFieldEnum::LikelyHasAds(_))) => {
|
|
|
|
- webpage.likely_has_ads = value.as_bool().unwrap_or_default();
|
|
|
|
- }
|
|
|
|
- Some(Field::Numerical(NumericalFieldEnum::LikelyHasPaywall(_))) => {
|
|
|
|
- webpage.likely_has_paywall = value.as_bool().unwrap_or_default();
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
|
|
|
|
- let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), &value);
|
|
|
|
- if !tag_id.is_empty() {
|
|
|
|
- webpage.recipe_first_ingredient_tag_id = Some(tag_id);
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- Some(Field::Text(TextFieldEnum::Keywords(_))) => {
|
|
|
|
- let keywords = str_value(text_field::Keywords.name(), &value);
|
|
|
|
- webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
|
|
|
|
- }
|
|
|
|
- _ => {}
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- webpage
|
|
|
|
- }
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
#[cfg(test)]
|
|
#[cfg(test)]
|
|
mod tests {
|
|
mod tests {
|
|
use candle_core::Tensor;
|
|
use candle_core::Tensor;
|
|
@@ -343,7 +232,7 @@ mod tests {
|
|
ranking::{Ranker, SignalComputer},
|
|
ranking::{Ranker, SignalComputer},
|
|
search_ctx::Ctx,
|
|
search_ctx::Ctx,
|
|
searcher::SearchQuery,
|
|
searcher::SearchQuery,
|
|
- webpage::{Html, Webpage},
|
|
|
|
|
|
+ webpage::{schema_org, Html, Webpage},
|
|
OneOrMany,
|
|
OneOrMany,
|
|
};
|
|
};
|
|
|
|
|