tantivy-py/src/searcher.rs

#![allow(clippy::new_ret_no_self)]

use crate::document::Document;
use crate::query::Query;
use crate::{get_field, to_pyerr};
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::PyObjectProtocol;
use tantivy as tv;
use tantivy::collector::{Count, MultiCollector, TopDocs};

/// Tantivy's Searcher class
///
/// A Searcher is used to search the index given a prepared Query.
#[pyclass]
pub(crate) struct Searcher {
    /// The wrapped Tantivy searcher, held as a `LeasedItem`. Every method
    /// exposed to Python delegates to it.
    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
}

/// The value attached to a single search hit.
///
/// A search either ranks hits by relevance or orders them by a `u64` field,
/// so the value is one of the two variants below.
#[derive(Clone)]
enum Fruit {
    /// Relevance score of the hit.
    Score(f32),
    /// Value of the field the results were ordered by.
    Order(u64),
}

impl std::fmt::Debug for Fruit {
    /// Writes only the wrapped number, without the variant name, so that a
    /// list of hits prints as plain values.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `write!` streams straight into the formatter; building a
        // temporary `String` with `format!` first is a wasted allocation.
        match self {
            Fruit::Score(s) => write!(f, "{}", s),
            Fruit::Order(o) => write!(f, "{}", o),
        }
    }
}

impl ToPyObject for Fruit {
    /// Converts the wrapped number into a Python object, whichever variant
    /// holds it.
    fn to_object(&self, py: Python) -> PyObject {
        // Both payloads are `Copy`, so they can be bound by value.
        match *self {
            Fruit::Score(score) => score.to_object(py),
            Fruit::Order(value) => value.to_object(py),
        }
    }
}

#[pyclass]
/// Object holding the results of a successful search.
pub(crate) struct SearchResult {
    /// The collected hits: each entry pairs the hit's score (or the value of
    /// the field the search was ordered by) with the address of the matching
    /// document.
    hits: Vec<(Fruit, DocAddress)>,
    #[pyo3(get)]
    /// How many documents matched the query. Only available if `count` was set
    /// to true during the search.
    count: Option<usize>,
}

#[pyproto]
impl PyObjectProtocol for SearchResult {
    /// Builds the Python `repr()` string: the hits are always listed, and
    /// the match count is appended only when it was collected.
    fn __repr__(&self) -> PyResult<String> {
        let repr = match self.count {
            Some(total) => format!(
                "SearchResult(hits: {:?}, count: {})",
                self.hits, total
            ),
            None => format!("SearchResult(hits: {:?})", self.hits),
        };
        Ok(repr)
    }
}

#[pymethods]
impl SearchResult {
    #[getter]
    /// The list of `(value, DocAddress)` tuples of the search results, where
    /// the value is the hit's score or the value of the field the search
    /// was ordered by.
    fn hits(&self, py: Python) -> PyResult<Vec<(PyObject, DocAddress)>> {
        let mut pairs: Vec<(PyObject, DocAddress)> =
            Vec::with_capacity(self.hits.len());
        for (fruit, address) in &self.hits {
            // Each call hands Python fresh objects; the stored hits are
            // left untouched.
            pairs.push((fruit.to_object(py), address.clone()));
        }
        Ok(pairs)
    }
}

#[pymethods]
impl Searcher {
    /// Search the index with the given query and collect results.
    ///
    /// Args:
    ///     query (Query): The query that will be used for the search.
    ///     limit (int, optional): The maximum number of search results to
    ///         return. Defaults to 10.
    ///     count (bool, optional): Should the number of documents that match
    ///         the query be returned as well. Defaults to true.
    ///     order_by_field (str, optional): The name of a schema field that
    ///         the results should be ordered by. The field must be declared
    ///         as a fast field when building the schema. Note, this only
    ///         works for unsigned fields.
    ///     offset (int, optional): The offset from which the results have
    ///         to be returned. Defaults to 0.
    ///
    /// Returns `SearchResult` object.
    ///
    /// Raises a ValueError if there was an error with the search.
    #[args(limit = 10, offset = 0, count = true)]
    fn search(
        &self,
        _py: Python,
        query: &Query,
        limit: usize,
        count: bool,
        order_by_field: Option<&str>,
        offset: usize,
    ) -> PyResult<SearchResult> {
        let mut multicollector = MultiCollector::new();

        // The count collector is only registered on request, and it has to
        // be added before the search is executed.
        let count_handle = if count {
            Some(multicollector.add_collector(Count))
        } else {
            None
        };

        // The two branches differ only in the collector they register and
        // in the `Fruit` variant wrapping each hit's value.
        let (mut multifruit, hits) = if let Some(order_by) = order_by_field {
            let field = get_field(&self.inner.index().schema(), order_by)?;
            let collector = TopDocs::with_limit(limit)
                .and_offset(offset)
                .order_by_u64_field(field);
            let top_docs_handle = multicollector.add_collector(collector);
            let mut multifruit = self
                .inner
                .search(&query.get(), &multicollector)
                .map_err(|e| PyValueError::new_err(e.to_string()))?;

            let hits: Vec<(Fruit, DocAddress)> = top_docs_handle
                .extract(&mut multifruit)
                .iter()
                .map(|(value, address)| {
                    (Fruit::Order(*value), DocAddress::from(address))
                })
                .collect();
            (multifruit, hits)
        } else {
            let collector = TopDocs::with_limit(limit).and_offset(offset);
            let top_docs_handle = multicollector.add_collector(collector);
            let mut multifruit = self
                .inner
                .search(&query.get(), &multicollector)
                .map_err(|e| PyValueError::new_err(e.to_string()))?;

            let hits: Vec<(Fruit, DocAddress)> = top_docs_handle
                .extract(&mut multifruit)
                .iter()
                .map(|(score, address)| {
                    (Fruit::Score(*score), DocAddress::from(address))
                })
                .collect();
            (multifruit, hits)
        };

        // `None` when no count was requested.
        let count = count_handle.map(|handle| handle.extract(&mut multifruit));

        Ok(SearchResult { hits, count })
    }

    /// Returns the overall number of documents in the index.
    #[getter]
    fn num_docs(&self) -> u64 {
        self.inner.num_docs()
    }

    /// Fetches a document from Tantivy's store given a DocAddress.
    ///
    /// Args:
    ///     doc_address (DocAddress): The DocAddress that is associated with
    ///         the document that we wish to fetch.
    ///
    /// Returns the Document, raises ValueError if the document can't be found.
    fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
        let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
        // Resolve the stored fields to their schema names before handing
        // the document to Python.
        let named_doc = self.inner.schema().to_named_doc(&doc);
        Ok(Document {
            field_values: named_doc.0,
        })
    }
}

/// DocAddress contains all the necessary information to identify a document
/// given a Searcher object.
///
/// It consists of an id identifying its segment, and its segment-local DocId.
/// The id used for the segment is actually an ordinal in the list of segments
/// held by a Searcher.
#[pyclass]
#[derive(Clone, Debug)]
pub(crate) struct DocAddress {
    /// Ordinal of the segment that hosts the document.
    pub(crate) segment_ord: tv::SegmentLocalId,
    /// Id of the document, local to that segment.
    pub(crate) doc: tv::DocId,
}

#[pymethods]
impl DocAddress {
    /// The segment ordinal is an id identifying the segment hosting the
    /// document. It is only meaningful in the context of a searcher.
    #[getter]
    fn segment_ord(&self) -> u32 {
        self.segment_ord
    }

    /// The segment-local DocId of the document.
    #[getter]
    fn doc(&self) -> u32 {
        self.doc
    }
}

/// Copies the segment ordinal and the segment-local doc id out of a Tantivy
/// `DocAddress` into the Python-facing wrapper.
impl From<&tv::DocAddress> for DocAddress {
    fn from(address: &tv::DocAddress) -> Self {
        let segment_ord = address.segment_ord();
        let doc = address.doc();
        Self { segment_ord, doc }
    }
}

/// Converts a borrowed Python-facing `DocAddress` into Tantivy's own
/// `DocAddress`.
///
/// This is implemented as `From` rather than as a hand-written `Into`: the
/// standard library's blanket impl derives `Into` from it, so existing
/// `doc_address.into()` call sites keep working, and `tv::DocAddress::from`
/// becomes available as well.
impl From<&DocAddress> for tv::DocAddress {
    fn from(doc_address: &DocAddress) -> Self {
        tv::DocAddress(doc_address.segment_ord, doc_address.doc)
    }
}

#[pyproto]
impl PyObjectProtocol for Searcher {
    /// Builds the Python `repr()` string, reporting how many documents and
    /// how many segments the searcher sees.
    fn __repr__(&self) -> PyResult<String> {
        let num_docs = self.inner.num_docs();
        let num_segments = self.inner.segment_readers().len();
        let repr = format!(
            "Searcher(num_docs={}, num_segments={})",
            num_docs, num_segments
        );
        Ok(repr)
    }
}
tantivy: Disable a clippy warning about new not returning self. The new() method using pyo3 does not return self, pyo3 handles some python magic for us instead. Disable the clippy warning about not returning self in new since it does not apply to us. 2019-08-08 11:55:40 +00:00			`#![allow(clippy::new_ret_no_self)]`

Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`use crate::document::Document;`
			`use crate::query::Query;`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`use crate::{get_field, to_pyerr};`
tantivy-py: Upgrade PyO3. This removes our nightly requirement but sadly it adds a requirement for things that are kept inside a Python class to be Send. Luckily for us almost everything in Tantivy is Send, except for the Query trait. This patch works around this by keeping the parser and query string inside our python Query object. This sadly means that we are going to parse the query string twice. 2020-09-19 09:36:31 +00:00			`use pyo3::exceptions::PyValueError;`
API simplification. See #9 2019-08-02 11:23:10 +00:00			`use pyo3::prelude::*;`
searcher: Allow the search to be sorted by an unsigned field. 2019-10-01 18:56:42 +00:00			`use pyo3::PyObjectProtocol;`
API simplification. See #9 2019-08-02 11:23:10 +00:00			`use tantivy as tv;`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`use tantivy::collector::{Count, MultiCollector, TopDocs};`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00
			`/// Tantivy's Searcher class`
			`///`
			`/// A Searcher is used to search the index given a prepared Query.`
			`#[pyclass]`
			`pub(crate) struct Searcher {`
			`pub(crate) inner: tv::LeasedItem<tv::Searcher>,`
			`}`

searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`#[derive(Clone)]`
			`enum Fruit {`
			`Score(f32),`
			`Order(u64),`
			`}`

			`impl std::fmt::Debug for Fruit {`
			`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
			`match self {`
			`Fruit::Score(s) => f.write_str(&format!("{}", s)),`
			`Fruit::Order(o) => f.write_str(&format!("{}", o)),`
			`}`
			`}`
			`}`

			`impl ToPyObject for Fruit {`
			`fn to_object(&self, py: Python) -> PyObject {`
			`match self {`
			`Fruit::Score(s) => s.to_object(py),`
			`Fruit::Order(o) => o.to_object(py),`
			`}`
			`}`
			`}`

searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`#[pyclass]`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`/// Object holding a results successful search.`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`pub(crate) struct SearchResult {`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`hits: Vec<(Fruit, DocAddress)>,`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`#[pyo3(get)]`
			/// How many documents matched the query. Only available if `count` was set
			`/// to true during the search.`
			`count: Option<usize>,`
			`}`

searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`#[pyproto]`
			`impl PyObjectProtocol for SearchResult {`
			`fn __repr__(&self) -> PyResult<String> {`
			`if let Some(count) = self.count {`
			`Ok(format!(`
			`"SearchResult(hits: {:?}, count: {})",`
			`self.hits, count`
			`))`
			`} else {`
			`Ok(format!("SearchResult(hits: {:?})", self.hits))`
			`}`
			`}`
			`}`

searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`#[pymethods]`
			`impl SearchResult {`
			`#[getter]`
			`/// The list of tuples that contains the scores and DocAddress of the`
			`/// search results.`
			`fn hits(&self, py: Python) -> PyResult<Vec<(PyObject, DocAddress)>> {`
			`let ret: Vec<(PyObject, DocAddress)> = self`
			`.hits`
			`.iter()`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`.map(\|(result, address)\| (result.to_object(py), address.clone()))`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`.collect();`
			`Ok(ret)`
			`}`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`}`

Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`#[pymethods]`
			`impl Searcher {`
			`/// Search the index with the given query and collect results.`
			`///`
			`/// Args:`
			`/// query (Query): The query that will be used for the search.`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`/// limit (int, optional): The maximum number of search results to`
			`/// return. Defaults to 10.`
			`/// count (bool, optional): Should the number of documents that match`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`/// the query be returned as well. Defaults to true.`
			`/// order_by_field (Field, optional): A schema field that the results`
			`/// should be ordered by. The field must be declared as a fast field`
			`/// when building the schema. Note, this only works for unsigned`
			`/// fields.`
updated tests for offset query 2020-09-06 10:26:17 +00:00			`/// offset (Field, optional): The offset from which the results have`
changed the args position 2020-09-06 10:07:05 +00:00			`/// to be returned.`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`///`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			/// Returns `SearchResult` object.
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`///`
			`/// Raises a ValueError if there was an error with the search.`
support for offset to do record pagination 2020-09-05 15:51:54 +00:00			`#[args(limit = 10, offset = 0, count = true)]`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`fn search(`
			`&self,`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`_py: Python,`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`query: &Query,`
searcher: Remove the collector concept. This patch removes the TopDocs collector class and adds a limit argument on the search method. 2019-10-01 18:05:38 +00:00			`limit: usize,`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`count: bool,`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`order_by_field: Option<&str>,`
updated tests for offset query 2020-09-06 10:26:17 +00:00			`offset: usize,`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`) -> PyResult<SearchResult> {`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`let mut multicollector = MultiCollector::new();`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00
			`let count_handle = if count {`
			`Some(multicollector.add_collector(Count))`
searcher: Allow the search to be sorted by an unsigned field. 2019-10-01 18:56:42 +00:00			`} else {`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`None`
			`};`

searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`let (mut multifruit, hits) = {`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`if let Some(order_by) = order_by_field {`
			`let field = get_field(&self.inner.index().schema(), order_by)?;`
formated the searcher.rs file 2020-09-05 15:58:59 +00:00			`let collector = TopDocs::with_limit(limit)`
			`.and_offset(offset)`
			`.order_by_u64_field(field);`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`let top_docs_handle = multicollector.add_collector(collector);`
tantivy-py: Upgrade PyO3. This removes our nightly requirement but sadly it adds a requirement for things that are kept inside a Python class to be Send. Luckily for us almost everything in Tantivy is Send, except for the Query trait. This patch works around this by keeping the parser and query string inside our python Query object. This sadly means that we are going to parse the query string twice. 2020-09-19 09:36:31 +00:00			`let ret = self.inner.search(&query.get(), &multicollector);`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00
			`match ret {`
			`Ok(mut r) => {`
			`let top_docs = top_docs_handle.extract(&mut r);`
			`let result: Vec<(Fruit, DocAddress)> = top_docs`
			`.iter()`
			`.map(\|(f, d)\| {`
			`(Fruit::Order(*f), DocAddress::from(d))`
			`})`
			`.collect();`
			`(r, result)`
			`}`
tantivy-py: Upgrade PyO3. This removes our nightly requirement but sadly it adds a requirement for things that are kept inside a Python class to be Send. Luckily for us almost everything in Tantivy is Send, except for the Query trait. This patch works around this by keeping the parser and query string inside our python Query object. This sadly means that we are going to parse the query string twice. 2020-09-19 09:36:31 +00:00			`Err(e) => return Err(PyValueError::new_err(e.to_string())),`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`}`
			`} else {`
support for offset to do record pagination 2020-09-05 15:51:54 +00:00			`let collector = TopDocs::with_limit(limit).and_offset(offset);`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`let top_docs_handle = multicollector.add_collector(collector);`
tantivy-py: Upgrade PyO3. This removes our nightly requirement but sadly it adds a requirement for things that are kept inside a Python class to be Send. Luckily for us almost everything in Tantivy is Send, except for the Query trait. This patch works around this by keeping the parser and query string inside our python Query object. This sadly means that we are going to parse the query string twice. 2020-09-19 09:36:31 +00:00			`let ret = self.inner.search(&query.get(), &multicollector);`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00
			`match ret {`
			`Ok(mut r) => {`
			`let top_docs = top_docs_handle.extract(&mut r);`
			`let result: Vec<(Fruit, DocAddress)> = top_docs`
			`.iter()`
			`.map(\|(f, d)\| {`
			`(Fruit::Score(*f), DocAddress::from(d))`
			`})`
			`.collect();`
			`(r, result)`
			`}`
tantivy-py: Upgrade PyO3. This removes our nightly requirement but sadly it adds a requirement for things that are kept inside a Python class to be Send. Luckily for us almost everything in Tantivy is Send, except for the Query trait. This patch works around this by keeping the parser and query string inside our python Query object. This sadly means that we are going to parse the query string twice. 2020-09-19 09:36:31 +00:00			`Err(e) => return Err(PyValueError::new_err(e.to_string())),`
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`}`
			`}`
			`};`

			`let count = match count_handle {`
			`Some(h) => Some(h.extract(&mut multifruit)),`
searcher: Remove the ability to order the search result. Ordering the search result by a field requires the field to be set up to support this at the index creation time. If it wasn't properly set up, such a search would crash the Python interpreter. Until a search returns an error that we can convert to a Python exception this feature will unlikely be supported. 2019-12-17 22:17:44 +00:00			`None => None,`
searcher: Allow the search to be sorted by an unsigned field. 2019-10-01 18:56:42 +00:00			`};`
searcher: Remove the collector concept. This patch removes the TopDocs collector class and adds a limit argument on the search method. 2019-10-01 18:05:38 +00:00
searcher: Use a search result struct. 2019-12-17 19:50:10 +00:00			`Ok(SearchResult { hits, count })`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`}`

			`/// Returns the overall number of documents in the index.`
			`#[getter]`
			`fn num_docs(&self) -> u64 {`
			`self.inner.num_docs()`
			`}`

			`/// Fetches a document from Tantivy's store given a DocAddress.`
			`///`
			`/// Args:`
			`/// doc_address (DocAddress): The DocAddress that is associated with`
			`/// the document that we wish to fetch.`
			`///`
			`/// Returns the Document, raises ValueError if the document can't be found.`
			`fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {`
API simplification. See #9 2019-08-02 11:23:10 +00:00			`let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;`
			`let named_doc = self.inner.schema().to_named_doc(&doc);`
			`Ok(Document {`
			`field_values: named_doc.0,`
			`})`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`}`
			`}`

			`/// DocAddress contains all the necessary information to identify a document`
			`/// given a Searcher object.`
			`///`
			`/// It consists in an id identifying its segment, and its segment-local DocId.`
			`/// The id used for the segment is actually an ordinal in the list of segment`
			`/// hold by a Searcher.`
			`#[pyclass]`
searcher: Add support to search and order the results by a field. 2020-04-19 10:26:08 +00:00			`#[derive(Clone, Debug)]`
Initial python bindings implementation. 2019-06-04 09:09:58 +00:00			`pub(crate) struct DocAddress {`
			`pub(crate) segment_ord: tv::SegmentLocalId,`
			`pub(crate) doc: tv::DocId,`
			`}`

			`#[pymethods]`
			`impl DocAddress {`
			`/// The segment ordinal is an id identifying the segment hosting the`
			`/// document. It is only meaningful, in the context of a searcher.`
			`#[getter]`
			`fn segment_ord(&self) -> u32 {`
			`self.segment_ord`
			`}`

			`/// The segment local DocId`
			`#[getter]`
			`fn doc(&self) -> u32 {`
			`self.doc`
			`}`
			`}`

			`impl From<&tv::DocAddress> for DocAddress {`
			`fn from(doc_address: &tv::DocAddress) -> Self {`
			`DocAddress {`
			`segment_ord: doc_address.segment_ord(),`
			`doc: doc_address.doc(),`
			`}`
			`}`
			`}`

			`impl Into<tv::DocAddress> for &DocAddress {`
			`fn into(self) -> tv::DocAddress {`
			`tv::DocAddress(self.segment_ord(), self.doc())`
			`}`
			`}`

API simplification. See #9 2019-08-02 11:23:10 +00:00			`#[pyproto]`
			`impl PyObjectProtocol for Searcher {`
			`fn __repr__(&self) -> PyResult<String> {`
			`Ok(format!(`
			`"Searcher(num_docs={}, num_segments={})",`
			`self.inner.num_docs(),`
			`self.inner.segment_readers().len()`
			`))`
			`}`
			`}`