tantivy-py/src/schemabuilder.rs

433 lines
15 KiB
Rust
Raw Normal View History

#![allow(clippy::new_ret_no_self)]
2020-09-20 08:19:29 +00:00
use pyo3::{exceptions, prelude::*};
use tantivy::schema;
use crate::schema::Schema;
2019-08-02 11:23:10 +00:00
use std::sync::{Arc, RwLock};
use tantivy::schema::{DateOptions, INDEXED};
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
///
/// Examples:
///
/// >>> builder = tantivy.SchemaBuilder()
///
/// >>> title = builder.add_text_field("title", stored=True)
/// >>> body = builder.add_text_field("body")
///
/// >>> schema = builder.build()
#[pyclass]
2019-08-02 11:23:10 +00:00
#[derive(Clone)]
pub(crate) struct SchemaBuilder {
2019-08-02 11:23:10 +00:00
pub(crate) builder: Arc<RwLock<Option<schema::SchemaBuilder>>>,
}
// Default value of the `tokenizer_name` argument in the `#[pyo3(signature)]`
// declarations of `add_text_field` and `add_json_field`.
const TOKENIZER: &str = "default";
// Default value of the `index_option` argument for the same methods; it is
// one of the strings accepted by `build_text_option`.
const RECORD: &str = "position";
#[pymethods]
impl SchemaBuilder {
#[new]
2020-04-19 11:35:14 +00:00
fn new() -> Self {
SchemaBuilder {
2019-08-02 11:23:10 +00:00
builder: Arc::new(From::from(Some(schema::Schema::builder()))),
2020-04-19 11:35:14 +00:00
}
}
/// Add a new text field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// tokenizer_name (str, optional): The name of the tokenizer that
/// should be used to process the field. Defaults to 'default'
/// index_option (str, optional): Sets which information should be
/// indexed with the tokens. Can be one of 'position', 'freq' or
/// 'basic'. Defaults to 'position'. The 'basic' index_option
/// records only the document ID, the 'freq' option records the
/// document id and the term frequency, while the 'position' option
/// records the document id, term frequency and the positions of
/// the term occurrences in the document.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (
name,
stored = false,
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
fn add_text_field(
&mut self,
name: &str,
stored: bool,
tokenizer_name: &str,
index_option: &str,
2019-08-02 11:23:10 +00:00
) -> PyResult<Self> {
let builder = &mut self.builder;
2022-04-15 03:50:37 +00:00
let options = SchemaBuilder::build_text_option(
stored,
tokenizer_name,
index_option,
)?;
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_text_field(name, options);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
/// Add a new signed integer field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the numeric options as a fast field. A
/// fast field is a column-oriented fashion storage for tantivy.
/// It is designed for the fast random access of some document
/// fields given a document id.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
fn add_integer_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: bool,
2019-08-02 11:23:10 +00:00
) -> PyResult<Self> {
let builder = &mut self.builder;
2023-01-17 00:43:39 +00:00
let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_i64_field(name, opts);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
/// Add a new float field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the numeric options as a fast field. A
/// fast field is a column-oriented fashion storage for tantivy.
/// It is designed for the fast random access of some document
/// fields given a document id.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
2023-01-17 00:43:39 +00:00
fn add_float_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: bool,
2023-01-17 00:43:39 +00:00
) -> PyResult<Self> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_f64_field(name, opts);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
));
}
Ok(self.clone())
}
/// Add a new unsigned integer field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the numeric options as a fast field. A
/// fast field is a column-oriented fashion storage for tantivy.
/// It is designed for the fast random access of some document
/// fields given a document id.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
fn add_unsigned_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: bool,
2019-08-02 11:23:10 +00:00
) -> PyResult<Self> {
let builder = &mut self.builder;
2023-01-17 00:43:39 +00:00
let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_u64_field(name, opts);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
/// Add a new boolean field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the numeric options as a fast field. A
/// fast field is a column-oriented fashion storage for tantivy.
/// It is designed for the fast random access of some document
/// fields given a document id.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
fn add_boolean_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: bool,
) -> PyResult<Self> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_bool_field(name, opts);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
));
}
Ok(self.clone())
}
/// Add a new date field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the date options as a fast field. A fast
/// field is a column-oriented fashion storage for tantivy. It is
/// designed for the fast random access of some document fields
/// given a document id.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
fn add_date_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: bool,
2019-08-02 11:23:10 +00:00
) -> PyResult<Self> {
let builder = &mut self.builder;
let mut opts = DateOptions::default();
if stored {
opts = opts.set_stored();
}
if indexed {
opts = opts.set_indexed();
}
if fast {
opts = opts.set_fast();
}
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_date_field(name, opts);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
2022-04-15 03:50:37 +00:00
/// Add a new json field to the schema.
///
/// Args:
/// name (str): the name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// tokenizer_name (str, optional): The name of the tokenizer that
/// should be used to process the field. Defaults to 'default'
/// index_option (str, optional): Sets which information should be
/// indexed with the tokens. Can be one of 'position', 'freq' or
/// 'basic'. Defaults to 'position'. The 'basic' index_option
/// records only the document ID, the 'freq' option records the
/// document id and the term frequency, while the 'position' option
/// records the document id, term frequency and the positions of
/// the term occurrences in the document.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[pyo3(signature = (
name,
2022-04-15 03:50:37 +00:00
stored = false,
tokenizer_name = TOKENIZER,
index_option = RECORD
))]
2022-04-15 03:50:37 +00:00
fn add_json_field(
&mut self,
name: &str,
stored: bool,
tokenizer_name: &str,
index_option: &str,
) -> PyResult<Self> {
let builder = &mut self.builder;
let options = SchemaBuilder::build_text_option(
stored,
tokenizer_name,
index_option,
)?;
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_json_field(name, options);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
));
}
Ok(self.clone())
}
/// Add a Facet field to the schema.
/// Args:
/// name (str): The name of the field.
2019-08-02 11:23:10 +00:00
fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
let builder = &mut self.builder;
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_facet_field(name, INDEXED);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
/// Add a fast bytes field to the schema.
///
/// Bytes field are not searchable and are only used
/// as fast field, to associate any kind of payload
/// to a document.
///
/// Args:
/// name (str): The name of the field.
2019-08-02 11:23:10 +00:00
fn add_bytes_field(&mut self, name: &str) -> PyResult<Self> {
let builder = &mut self.builder;
2019-08-02 11:23:10 +00:00
if let Some(builder) = builder.write().unwrap().as_mut() {
builder.add_bytes_field(name, INDEXED);
} else {
return Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
2019-08-02 11:23:10 +00:00
));
}
2019-08-02 11:23:10 +00:00
Ok(self.clone())
}
/// Finalize the creation of a Schema.
///
/// Returns a Schema object. After this is called the SchemaBuilder cannot
/// be used anymore.
fn build(&mut self) -> PyResult<Schema> {
2019-08-02 11:23:10 +00:00
let builder = self.builder.write().unwrap().take();
if let Some(builder) = builder {
let schema = builder.build();
Ok(Schema { inner: schema })
} else {
Err(exceptions::PyValueError::new_err(
"Schema builder object isn't valid anymore.",
))
}
}
}
impl SchemaBuilder {
2023-01-17 00:43:39 +00:00
fn build_numeric_option(
stored: bool,
indexed: bool,
fast: bool,
2022-04-15 03:50:37 +00:00
) -> PyResult<schema::NumericOptions> {
let opts = schema::NumericOptions::default();
let opts = if stored { opts.set_stored() } else { opts };
let opts = if indexed { opts.set_indexed() } else { opts };
let opts = if fast { opts.set_fast() } else { opts };
Ok(opts)
}
2022-04-15 03:50:37 +00:00
fn build_text_option(
stored: bool,
tokenizer_name: &str,
index_option: &str,
) -> PyResult<schema::TextOptions> {
let index_option = match index_option {
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
"freq" => schema::IndexRecordOption::WithFreqs,
"basic" => schema::IndexRecordOption::Basic,
_ => return Err(exceptions::PyValueError::new_err(
"Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
))
};
let indexing = schema::TextFieldIndexing::default()
.set_tokenizer(tokenizer_name)
.set_index_option(index_option);
let options =
schema::TextOptions::default().set_indexing_options(indexing);
let options = if stored {
options.set_stored()
} else {
options
};
Ok(options)
}
}