tantivy-py/src/schemabuilder.rs

#![allow(clippy::new_ret_no_self)]

use pyo3::{exceptions, prelude::*};

use tantivy::schema;

use crate::schema::Schema;
use std::sync::{Arc, RwLock};
use tantivy::schema::{DateOptions, INDEXED};

/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
///
/// Examples:
///
///     >>> builder = tantivy.SchemaBuilder()
///
///     >>> title = builder.add_text_field("title", stored=True)
///     >>> body = builder.add_text_field("body")
///
///     >>> schema = builder.build()
#[pyclass]
#[derive(Clone)]
pub(crate) struct SchemaBuilder {
    pub(crate) builder: Arc<RwLock<Option<schema::SchemaBuilder>>>,
}

const TOKENIZER: &str = "default";
const RECORD: &str = "position";

#[pymethods]
impl SchemaBuilder {
    #[new]
    fn new() -> Self {
        SchemaBuilder {
            builder: Arc::new(From::from(Some(schema::Schema::builder()))),
        }
    }

    /// Add a new text field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     tokenizer_name (str, optional): The name of the tokenizer that
    ///         should be used to process the field. Defaults to 'default'
    ///     index_option (str, optional): Sets which information should be
    ///         indexed with the tokens. Can be one of 'position', 'freq' or
    ///         'basic'. Defaults to 'position'. The 'basic' index_option
    ///         records only the document ID, the 'freq' option records the
    ///         document id and the term frequency, while the 'position' option
    ///         records the document id, term frequency and the positions of
    ///         the term occurrences in the document.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[pyo3(signature = (
        name,
        stored = false,
        tokenizer_name = TOKENIZER,
        index_option = RECORD
    ))]
    fn add_text_field(
        &mut self,
        name: &str,
        stored: bool,
        tokenizer_name: &str,
        index_option: &str,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;
        let options = SchemaBuilder::build_text_option(
            stored,
            tokenizer_name,
            index_option,
        )?;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_text_field(name, options);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Add a new signed integer field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the u64 options as a single-valued fast
    ///         field. Fast fields are designed for random access. Access time
    ///         are similar to a random lookup in an array. If more than one
    ///         value is associated to a fast field, only the last one is kept.
    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
    ///         the document must have exactly one value associated to the
    ///         document. If this is set to 'multi', the document can have any
    ///         number of values associated to the document. Defaults to None,
    ///         which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
    fn add_integer_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: bool,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_i64_field(name, opts);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    #[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
    fn add_float_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: bool,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_f64_field(name, opts);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Add a new unsigned integer field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the u64 options as a single-valued fast
    ///         field. Fast fields are designed for random access. Access time
    ///         are similar to a random lookup in an array. If more than one
    ///         value is associated to a fast field, only the last one is kept.
    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
    ///         the document must have exactly one value associated to the
    ///         document. If this is set to 'multi', the document can have any
    ///         number of values associated to the document. Defaults to None,
    ///         which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
    fn add_unsigned_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: bool,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_u64_field(name, opts);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Add a new date field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the u64 options as a single-valued fast
    ///         field. Fast fields are designed for random access. Access time
    ///         are similar to a random lookup in an array. If more than one
    ///         value is associated to a fast field, only the last one is kept.
    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
    ///         the document must have exactly one value associated to the
    ///         document. If this is set to 'multi', the document can have any
    ///         number of values associated to the document. Defaults to None,
    ///         which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[pyo3(signature = (name, stored = false, indexed = false, fast = false))]
    fn add_date_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: bool,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;

        let mut opts = DateOptions::default();
        if stored {
            opts = opts.set_stored();
        }
        if indexed {
            opts = opts.set_indexed();
        }
        if fast {
            opts = opts.set_fast();
        }

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_date_field(name, opts);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Add a new json field to the schema.
    ///
    /// Args:
    ///     name (str): the name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     tokenizer_name (str, optional): The name of the tokenizer that
    ///         should be used to process the field. Defaults to 'default'
    ///     index_option (str, optional): Sets which information should be
    ///         indexed with the tokens. Can be one of 'position', 'freq' or
    ///         'basic'. Defaults to 'position'. The 'basic' index_option
    ///         records only the document ID, the 'freq' option records the
    ///         document id and the term frequency, while the 'position' option
    ///         records the document id, term frequency and the positions of
    ///         the term occurrences in the document.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[pyo3(signature = (
        name,
        stored = false,
        tokenizer_name = TOKENIZER,
        index_option = RECORD
    ))]
    fn add_json_field(
        &mut self,
        name: &str,
        stored: bool,
        tokenizer_name: &str,
        index_option: &str,
    ) -> PyResult<Self> {
        let builder = &mut self.builder;
        let options = SchemaBuilder::build_text_option(
            stored,
            tokenizer_name,
            index_option,
        )?;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_json_field(name, options);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }

        Ok(self.clone())
    }

    /// Add a Facet field to the schema.
    /// Args:
    ///     name (str): The name of the field.
    fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
        let builder = &mut self.builder;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_facet_field(name, INDEXED);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Add a fast bytes field to the schema.
    ///
    /// Bytes field are not searchable and are only used
    /// as fast field, to associate any kind of payload
    /// to a document.
    ///
    /// Args:
    ///     name (str): The name of the field.
    fn add_bytes_field(&mut self, name: &str) -> PyResult<Self> {
        let builder = &mut self.builder;

        if let Some(builder) = builder.write().unwrap().as_mut() {
            builder.add_bytes_field(name, INDEXED);
        } else {
            return Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ));
        }
        Ok(self.clone())
    }

    /// Finalize the creation of a Schema.
    ///
    /// Returns a Schema object. After this is called the SchemaBuilder cannot
    /// be used anymore.
    fn build(&mut self) -> PyResult<Schema> {
        let builder = self.builder.write().unwrap().take();
        if let Some(builder) = builder {
            let schema = builder.build();
            Ok(Schema { inner: schema })
        } else {
            Err(exceptions::PyValueError::new_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }
}

impl SchemaBuilder {
    fn build_numeric_option(
        stored: bool,
        indexed: bool,
        fast: bool,
    ) -> PyResult<schema::NumericOptions> {
        let opts = schema::NumericOptions::default();
        let opts = if stored { opts.set_stored() } else { opts };
        let opts = if indexed { opts.set_indexed() } else { opts };
        let opts = if fast { opts.set_fast() } else { opts };
        Ok(opts)
    }

    fn build_text_option(
        stored: bool,
        tokenizer_name: &str,
        index_option: &str,
    ) -> PyResult<schema::TextOptions> {
        let index_option = match index_option {
            "position" => schema::IndexRecordOption::WithFreqsAndPositions,
            "freq" => schema::IndexRecordOption::WithFreqs,
            "basic" => schema::IndexRecordOption::Basic,
            _ => return Err(exceptions::PyValueError::new_err(
                "Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
            ))
        };

        let indexing = schema::TextFieldIndexing::default()
            .set_tokenizer(tokenizer_name)
            .set_index_option(index_option);

        let options =
            schema::TextOptions::default().set_indexing_options(indexing);
        let options = if stored {
            options.set_stored()
        } else {
            options
        };

        Ok(options)
    }
}