From 32197431f79a86a309be8633b9a88a1c5cad32d8 Mon Sep 17 00:00:00 2001 From: Chris Tam Date: Thu, 28 Dec 2023 04:34:00 -0500 Subject: [PATCH] Support fast text fields (#180) --- src/schemabuilder.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs index abe8951..2c2d013 100644 --- a/src/schemabuilder.rs +++ b/src/schemabuilder.rs @@ -29,6 +29,7 @@ pub(crate) struct SchemaBuilder { pub(crate) builder: Arc>>, } +const NO_TOKENIZER_NAME: &str = "raw"; const TOKENIZER: &str = "default"; const RECORD: &str = "position"; @@ -53,6 +54,14 @@ impl SchemaBuilder { /// stored (bool, optional): If true sets the field as stored, the /// content of the field can be later restored from a Searcher. /// Defaults to False. + /// fast (bool, optional): Set the text options as a fast field. A + /// fast field is stored by tantivy in a column-oriented fashion. + /// Text fast fields will have the term ids stored in the fast + /// field. The fast field will be a multivalued fast field. + /// It is recommended to use the "raw" tokenizer, since it will + /// store the original text unchanged. The "default" tokenizer will + /// store the terms as lower case and this will be reflected in the + /// dictionary. Defaults to False. /// tokenizer_name (str, optional): The name of the tokenizer that /// should be used to process the field. 
Defaults to 'default' /// index_option (str, optional): Sets which information should be @@ -68,6 +77,7 @@ impl SchemaBuilder { #[pyo3(signature = ( name, stored = false, + fast = false, tokenizer_name = TOKENIZER, index_option = RECORD ))] @@ -75,12 +85,14 @@ impl SchemaBuilder { &mut self, name: &str, stored: bool, + fast: bool, tokenizer_name: &str, index_option: &str, ) -> PyResult { let builder = &mut self.builder; let options = SchemaBuilder::build_text_option( stored, + fast, tokenizer_name, index_option, )?; @@ -296,6 +308,14 @@ impl SchemaBuilder { /// stored (bool, optional): If true sets the field as stored, the /// content of the field can be later restored from a Searcher. /// Defaults to False. + /// fast (bool, optional): Set the text options as a fast field. A + /// fast field is stored by tantivy in a column-oriented fashion. + /// Text fast fields will have the term ids stored in the fast + /// field. The fast field will be a multivalued fast field. + /// It is recommended to use the "raw" tokenizer, since it will + /// store the original text unchanged. The "default" tokenizer will + /// store the terms as lower case and this will be reflected in the + /// dictionary. Defaults to False. /// tokenizer_name (str, optional): The name of the tokenizer that /// should be used to process the field. 
Defaults to 'default' /// index_option (str, optional): Sets which information should be @@ -311,6 +331,7 @@ impl SchemaBuilder { #[pyo3(signature = ( name, stored = false, + fast = false, tokenizer_name = TOKENIZER, index_option = RECORD ))] @@ -318,12 +339,14 @@ impl SchemaBuilder { &mut self, name: &str, stored: bool, + fast: bool, tokenizer_name: &str, index_option: &str, ) -> PyResult { let builder = &mut self.builder; let options = SchemaBuilder::build_text_option( stored, + fast, tokenizer_name, index_option, )?; @@ -482,6 +505,7 @@ impl SchemaBuilder { fn build_text_option( stored: bool, + fast: bool, tokenizer_name: &str, index_option: &str, ) -> PyResult { @@ -506,6 +530,17 @@ impl SchemaBuilder { options }; + let options = if fast { + let text_tokenizer = if tokenizer_name != NO_TOKENIZER_NAME { + Some(tokenizer_name) + } else { + None + }; + options.set_fast(text_tokenizer) + } else { + options + }; + Ok(options) } }