2019-08-08 11:55:40 +00:00
|
|
|
#![allow(clippy::new_ret_no_self)]
|
|
|
|
|
2019-06-04 09:09:58 +00:00
|
|
|
use crate::document::Document;
|
|
|
|
use crate::query::Query;
|
2019-12-17 22:17:44 +00:00
|
|
|
use crate::{to_pyerr};
|
|
|
|
use pyo3::exceptions::ValueError;
|
2019-08-02 11:23:10 +00:00
|
|
|
use pyo3::prelude::*;
|
2019-10-01 18:56:42 +00:00
|
|
|
use pyo3::PyObjectProtocol;
|
2019-08-02 11:23:10 +00:00
|
|
|
use tantivy as tv;
|
2019-12-17 22:17:44 +00:00
|
|
|
use tantivy::collector::{Count, MultiCollector, TopDocs};
|
2019-06-04 09:09:58 +00:00
|
|
|
|
|
|
|
/// Tantivy's Searcher class
|
|
|
|
///
|
|
|
|
/// A Searcher is used to search the index given a prepared Query.
|
|
|
|
#[pyclass]
|
|
|
|
pub(crate) struct Searcher {
|
|
|
|
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
|
|
|
|
}
|
|
|
|
|
2019-12-17 19:50:10 +00:00
|
|
|
#[pyclass]
|
2019-12-17 22:17:44 +00:00
|
|
|
/// Object holding a results successful search.
|
2019-12-17 19:50:10 +00:00
|
|
|
pub(crate) struct SearchResult {
|
2019-12-17 22:17:44 +00:00
|
|
|
hits: Vec<(PyObject, DocAddress)>,
|
|
|
|
#[pyo3(get)]
|
|
|
|
/// How many documents matched the query. Only available if `count` was set
|
|
|
|
/// to true during the search.
|
|
|
|
count: Option<usize>,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[pymethods]
|
|
|
|
impl SearchResult {
|
|
|
|
#[getter]
|
|
|
|
/// The list of tuples that contains the scores and DocAddress of the
|
|
|
|
/// search results.
|
|
|
|
fn hits(&self, py: Python) -> PyResult<Vec<(PyObject, DocAddress)>> {
|
|
|
|
let ret: Vec<(PyObject, DocAddress)> = self
|
|
|
|
.hits
|
|
|
|
.iter()
|
|
|
|
.map(|(obj, address)| (obj.clone_ref(py), address.clone()))
|
|
|
|
.collect();
|
|
|
|
Ok(ret)
|
|
|
|
}
|
2019-12-17 19:50:10 +00:00
|
|
|
}
|
|
|
|
|
2019-06-04 09:09:58 +00:00
|
|
|
#[pymethods]
|
|
|
|
impl Searcher {
|
|
|
|
/// Search the index with the given query and collect results.
|
|
|
|
///
|
|
|
|
/// Args:
|
|
|
|
/// query (Query): The query that will be used for the search.
|
2019-12-17 22:17:44 +00:00
|
|
|
/// limit (int, optional): The maximum number of search results to
|
|
|
|
/// return. Defaults to 10.
|
|
|
|
/// count (bool, optional): Should the number of documents that match
|
|
|
|
/// the query be returned as well. Defaults to true.
|
2019-06-04 09:09:58 +00:00
|
|
|
///
|
2019-12-17 22:17:44 +00:00
|
|
|
/// Returns `SearchResult` object.
|
2019-06-04 09:09:58 +00:00
|
|
|
///
|
|
|
|
/// Raises a ValueError if there was an error with the search.
|
2019-12-17 22:17:44 +00:00
|
|
|
#[args(limit = 10, count = true)]
|
2019-06-04 09:09:58 +00:00
|
|
|
fn search(
|
|
|
|
&self,
|
2019-10-01 18:05:38 +00:00
|
|
|
py: Python,
|
2019-06-04 09:09:58 +00:00
|
|
|
query: &Query,
|
2019-10-01 18:05:38 +00:00
|
|
|
limit: usize,
|
2019-12-17 19:50:10 +00:00
|
|
|
count: bool,
|
|
|
|
) -> PyResult<SearchResult> {
|
2019-12-17 22:17:44 +00:00
|
|
|
let mut multicollector = MultiCollector::new();
|
2019-12-17 19:50:10 +00:00
|
|
|
|
|
|
|
let count_handle = if count {
|
|
|
|
Some(multicollector.add_collector(Count))
|
2019-10-01 18:56:42 +00:00
|
|
|
} else {
|
2019-12-17 19:50:10 +00:00
|
|
|
None
|
|
|
|
};
|
|
|
|
|
2019-12-17 22:17:44 +00:00
|
|
|
let (mut multifruit, hits) = {
|
|
|
|
let collector = TopDocs::with_limit(limit);
|
|
|
|
let top_docs_handle = multicollector.add_collector(collector);
|
|
|
|
let ret = self.inner.search(&query.inner, &multicollector);
|
|
|
|
|
|
|
|
match ret {
|
|
|
|
Ok(mut r) => {
|
|
|
|
let top_docs = top_docs_handle.extract(&mut r);
|
|
|
|
let result: Vec<(PyObject, DocAddress)> = top_docs
|
|
|
|
.iter()
|
|
|
|
.map(|(f, d)| ((*f).into_py(py), DocAddress::from(d)))
|
|
|
|
.collect();
|
|
|
|
(r, result)
|
2019-12-17 19:50:10 +00:00
|
|
|
}
|
2019-12-17 22:17:44 +00:00
|
|
|
Err(e) => return Err(ValueError::py_err(e.to_string())),
|
2019-12-17 19:50:10 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
let count = match count_handle {
|
|
|
|
Some(h) => Some(h.extract(&mut multifruit)),
|
2019-12-17 22:17:44 +00:00
|
|
|
None => None,
|
2019-10-01 18:56:42 +00:00
|
|
|
};
|
2019-10-01 18:05:38 +00:00
|
|
|
|
2019-12-17 19:50:10 +00:00
|
|
|
Ok(SearchResult { hits, count })
|
2019-06-04 09:09:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the overall number of documents in the index.
|
|
|
|
#[getter]
|
|
|
|
fn num_docs(&self) -> u64 {
|
|
|
|
self.inner.num_docs()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Fetches a document from Tantivy's store given a DocAddress.
|
|
|
|
///
|
|
|
|
/// Args:
|
|
|
|
/// doc_address (DocAddress): The DocAddress that is associated with
|
|
|
|
/// the document that we wish to fetch.
|
|
|
|
///
|
|
|
|
/// Returns the Document, raises ValueError if the document can't be found.
|
|
|
|
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
|
2019-08-02 11:23:10 +00:00
|
|
|
let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
|
|
|
|
let named_doc = self.inner.schema().to_named_doc(&doc);
|
|
|
|
Ok(Document {
|
|
|
|
field_values: named_doc.0,
|
|
|
|
})
|
2019-06-04 09:09:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// DocAddress contains all the necessary information to identify a document
|
|
|
|
/// given a Searcher object.
|
|
|
|
///
|
|
|
|
/// It consists in an id identifying its segment, and its segment-local DocId.
|
|
|
|
/// The id used for the segment is actually an ordinal in the list of segment
|
|
|
|
/// hold by a Searcher.
|
|
|
|
#[pyclass]
|
2019-12-17 22:17:44 +00:00
|
|
|
#[derive(Clone)]
|
2019-06-04 09:09:58 +00:00
|
|
|
pub(crate) struct DocAddress {
|
|
|
|
pub(crate) segment_ord: tv::SegmentLocalId,
|
|
|
|
pub(crate) doc: tv::DocId,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[pymethods]
|
|
|
|
impl DocAddress {
|
|
|
|
/// The segment ordinal is an id identifying the segment hosting the
|
|
|
|
/// document. It is only meaningful, in the context of a searcher.
|
|
|
|
#[getter]
|
|
|
|
fn segment_ord(&self) -> u32 {
|
|
|
|
self.segment_ord
|
|
|
|
}
|
|
|
|
|
|
|
|
/// The segment local DocId
|
|
|
|
#[getter]
|
|
|
|
fn doc(&self) -> u32 {
|
|
|
|
self.doc
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl From<&tv::DocAddress> for DocAddress {
|
|
|
|
fn from(doc_address: &tv::DocAddress) -> Self {
|
|
|
|
DocAddress {
|
|
|
|
segment_ord: doc_address.segment_ord(),
|
|
|
|
doc: doc_address.doc(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Into<tv::DocAddress> for &DocAddress {
|
|
|
|
fn into(self) -> tv::DocAddress {
|
|
|
|
tv::DocAddress(self.segment_ord(), self.doc())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-02 11:23:10 +00:00
|
|
|
#[pyproto]
|
|
|
|
impl PyObjectProtocol for Searcher {
|
|
|
|
fn __repr__(&self) -> PyResult<String> {
|
|
|
|
Ok(format!(
|
|
|
|
"Searcher(num_docs={}, num_segments={})",
|
|
|
|
self.inner.num_docs(),
|
|
|
|
self.inner.segment_readers().len()
|
|
|
|
))
|
|
|
|
}
|
|
|
|
}
|