commit
f57c4669c8
|
@ -11,7 +11,8 @@ crate-type = ["cdylib"]
|
|||
|
||||
[dependencies]
|
||||
chrono = "0.4"
|
||||
tantivy = { git = "https://github.com/tantivy-search/tantivy" }
|
||||
tantivy = { git = "https://github.com/tantivy-search/tantivy", branch = "master" }
|
||||
itertools = "0.8"
|
||||
|
||||
[dependencies.pyo3]
|
||||
version = "0.7.0"
|
||||
|
|
50
README.md
50
README.md
|
@ -24,35 +24,37 @@ needs to be built. After that documents can be added to the index and a reader
|
|||
can be created to search the index.
|
||||
|
||||
```python
|
||||
builder = tantivy.SchemaBuilder()
|
||||
import tantivy
|
||||
|
||||
title = builder.add_text_field("title", stored=True)
|
||||
body = builder.add_text_field("body")
|
||||
# Declaring our schema.
|
||||
schema_builder = tantivy.SchemaBuilder()
|
||||
schema_builder.add_text_field("title", stored=True)
|
||||
schema_builder.add_text_field("body", stored=True)
|
||||
schema = schema_builder.build()
|
||||
|
||||
schema = builder.build()
|
||||
index = tantivy.Index(schema)
|
||||
# Creating our index (in memory, but filesystem is available too)
|
||||
index = tantivy.Index(schema)
|
||||
|
||||
writer = index.writer()
|
||||
|
||||
doc = tantivy.Document()
|
||||
doc.add_text(title, "The Old Man and the Sea")
|
||||
doc.add_text(body, ("He was an old man who fished alone in a skiff in"
|
||||
"the Gulf Stream and he had gone eighty-four days "
|
||||
"now without taking a fish."))
|
||||
writer.add_document(doc)
|
||||
writer.commit()
|
||||
# Adding one document.
|
||||
writer = index.writer()
|
||||
writer.add_document({
|
||||
"title": "The Old Man and the Sea",
|
||||
"body": """He was an old man who fished alone in a skiff in
|
||||
the Gulf Stream and he had gone eighty-four days
|
||||
now without taking a fish."""
|
||||
})
|
||||
# ... and committing
|
||||
writer.commit()
|
||||
|
||||
reader = index.reader()
|
||||
searcher = reader.searcher()
|
||||
|
||||
query_parser = tantivy.QueryParser.for_index(index, [title, body])
|
||||
query = query_parser.parse_query("sea whale")
|
||||
# Reload the index to ensure it points to the last commit.
|
||||
index.reload();
|
||||
searcher = index.searcher()
|
||||
query = index.parse_query("sea whale", ["title", "body"])
|
||||
top_docs = tantivy.TopDocs(3)
|
||||
|
||||
top_docs = tantivy.TopDocs(10)
|
||||
result = searcher.search(query, top_docs)
|
||||
|
||||
_, doc_address = result[0]
|
||||
|
||||
searched_doc = searcher.doc(doc_address)
|
||||
assert searched_doc.get_first(title) == "The Old Man and the Sea"
|
||||
(best_score, best_doc_address) = searcher.search(query, nhits=3)[0]
|
||||
best_doc = searcher.doc(best_doc_address)
|
||||
assert best_doc["title"] == ["The Old Man and the Sea"]
|
||||
```
|
||||
|
|
307
src/document.rs
307
src/document.rs
|
@ -1,75 +1,255 @@
|
|||
#![allow(clippy::new_ret_no_self)]
|
||||
#![allow(clippy::wrong_self_convention)]
|
||||
|
||||
use itertools::Itertools;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyDateTime;
|
||||
use pyo3::types::{PyAny, PyDateTime, PyDict, PyList, PyTuple};
|
||||
use pyo3::types::{PyDateAccess, PyTimeAccess};
|
||||
|
||||
use chrono::offset::TimeZone;
|
||||
use chrono::Utc;
|
||||
use chrono::{Datelike, Timelike, Utc};
|
||||
|
||||
use tantivy as tv;
|
||||
|
||||
use crate::facet::Facet;
|
||||
use crate::field::{Field, FieldValue};
|
||||
use crate::to_pyerr;
|
||||
use pyo3::{PyMappingProtocol, PyObjectProtocol};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use tantivy::schema::Value;
|
||||
|
||||
fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
|
||||
Ok(match value {
|
||||
Value::Str(text) => text.into_object(py),
|
||||
Value::U64(num) => num.into_object(py),
|
||||
Value::I64(num) => num.into_object(py),
|
||||
Value::F64(num) => num.into_object(py),
|
||||
Value::Bytes(b) => b.to_object(py),
|
||||
Value::Date(d) => PyDateTime::new(
|
||||
py,
|
||||
d.year(),
|
||||
d.month() as u8,
|
||||
d.day() as u8,
|
||||
d.hour() as u8,
|
||||
d.minute() as u8,
|
||||
d.second() as u8,
|
||||
d.timestamp_subsec_micros(),
|
||||
None,
|
||||
)?
|
||||
.into_object(py),
|
||||
Value::Facet(f) => Facet { inner: f.clone() }.into_object(py),
|
||||
})
|
||||
}
|
||||
|
||||
fn value_to_string(value: &Value) -> String {
|
||||
match value {
|
||||
Value::Str(text) => text.clone(),
|
||||
Value::U64(num) => format!("{}", num),
|
||||
Value::I64(num) => format!("{}", num),
|
||||
Value::F64(num) => format!("{}", num),
|
||||
Value::Bytes(bytes) => format!("{:?}", bytes),
|
||||
Value::Date(d) => format!("{:?}", d),
|
||||
Value::Facet(facet) => facet.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Tantivy's Document is the object that can be indexed and then searched for.
|
||||
///
|
||||
/// Documents are fundamentally a collection of unordered tuples
|
||||
/// (field, value). In this list, one field may appear more than once.
|
||||
/// (field_name, value). In this list, one field may appear more than once.
|
||||
///
|
||||
/// Example:
|
||||
/// >>> doc = tantivy.Document()
|
||||
/// >>> doc.add_text(title, "The Old Man and the Sea")
|
||||
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
|
||||
/// >>> doc.add_text("title", "The Old Man and the Sea")
|
||||
/// >>> doc.add_text("body", ("He was an old man who fished alone in a "
|
||||
/// "skiff in the Gulf Stream and he had gone "
|
||||
/// "eighty-four days now without taking a fish."))
|
||||
///
|
||||
/// For simplicity, it is also possible to build a `Document` by passing the field
|
||||
/// values directly as constructor arguments.
|
||||
///
|
||||
/// Example:
|
||||
/// >>> doc = tantivy.Document(title=["The Old Man and the Sea"], body=["..."])
|
||||
///
|
||||
/// As syntactic sugar, tantivy also allows the user to pass a single value
|
||||
/// if there is only one. In other words, the following is also legal.
|
||||
///
|
||||
/// Example:
|
||||
/// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...")
|
||||
|
||||
#[pyclass]
|
||||
#[derive(Default)]
|
||||
pub(crate) struct Document {
|
||||
pub(crate) inner: tv::Document,
|
||||
pub(crate) field_values: BTreeMap<String, Vec<tv::schema::Value>>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Document {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let doc_str = self
|
||||
.field_values
|
||||
.iter()
|
||||
.map(|(field_name, field_values)| {
|
||||
let mut values_str =
|
||||
field_values.iter().map(value_to_string).join(",");
|
||||
values_str.truncate(10);
|
||||
format!("{}=[{}]", field_name, values_str)
|
||||
})
|
||||
.join(",");
|
||||
write!(f, "Document({})", doc_str)
|
||||
}
|
||||
}
|
||||
|
||||
fn add_value<T>(doc: &mut Document, field_name: String, value: T)
|
||||
where
|
||||
Value: From<T>,
|
||||
{
|
||||
doc.field_values
|
||||
.entry(field_name)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(Value::from(value));
|
||||
}
|
||||
|
||||
fn extract_value(any: &PyAny) -> PyResult<Value> {
|
||||
if let Ok(s) = any.extract::<String>() {
|
||||
return Ok(Value::Str(s));
|
||||
}
|
||||
if let Ok(num) = any.extract::<i64>() {
|
||||
return Ok(Value::I64(num));
|
||||
}
|
||||
if let Ok(num) = any.extract::<f64>() {
|
||||
return Ok(Value::F64(num));
|
||||
}
|
||||
if let Ok(py_datetime) = any.downcast_ref::<PyDateTime>() {
|
||||
let datetime = Utc
|
||||
.ymd(
|
||||
py_datetime.get_year(),
|
||||
py_datetime.get_month().into(),
|
||||
py_datetime.get_day().into(),
|
||||
)
|
||||
.and_hms_micro(
|
||||
py_datetime.get_hour().into(),
|
||||
py_datetime.get_minute().into(),
|
||||
py_datetime.get_second().into(),
|
||||
py_datetime.get_microsecond(),
|
||||
);
|
||||
return Ok(Value::Date(datetime));
|
||||
}
|
||||
if let Ok(facet) = any.downcast_ref::<Facet>() {
|
||||
return Ok(Value::Facet(facet.inner.clone()));
|
||||
}
|
||||
Err(to_pyerr(format!("Value unsupported {:?}", any)))
|
||||
}
|
||||
|
||||
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
|
||||
if let Ok(values) = any.downcast_ref::<PyList>() {
|
||||
values.iter().map(extract_value).collect()
|
||||
} else {
|
||||
Ok(vec![extract_value(any)?])
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Document {
|
||||
#[new]
|
||||
fn new(obj: &PyRawObject) {
|
||||
obj.init(Document {
|
||||
inner: tv::Document::default(),
|
||||
});
|
||||
#[args(kwargs = "**")]
|
||||
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut document = Document::default();
|
||||
if let Some(field_dict) = kwargs {
|
||||
document.extend(field_dict)?;
|
||||
}
|
||||
obj.init(document);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> {
|
||||
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
|
||||
BTreeMap::new();
|
||||
for key_value_any in py_dict.items() {
|
||||
if let Ok(key_value) = key_value_any.downcast_ref::<PyTuple>() {
|
||||
if key_value.len() != 2 {
|
||||
continue;
|
||||
}
|
||||
let key: String = key_value.get_item(0).extract()?;
|
||||
let value_list =
|
||||
extract_value_single_or_list(key_value.get_item(1))?;
|
||||
field_values.insert(key, value_list);
|
||||
}
|
||||
}
|
||||
self.field_values.extend(field_values.into_iter());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[staticmethod]
|
||||
fn from_dict(py_dict: &PyDict) -> PyResult<Document> {
|
||||
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
|
||||
BTreeMap::new();
|
||||
for key_value_any in py_dict.items() {
|
||||
if let Ok(key_value) = key_value_any.downcast_ref::<PyTuple>() {
|
||||
if key_value.len() != 2 {
|
||||
continue;
|
||||
}
|
||||
let key: String = key_value.get_item(0).extract()?;
|
||||
let value_list =
|
||||
extract_value_single_or_list(key_value.get_item(1))?;
|
||||
field_values.insert(key, value_list);
|
||||
}
|
||||
}
|
||||
Ok(Document { field_values })
|
||||
}
|
||||
|
||||
/// Returns a dictionary with the different
|
||||
/// field values.
|
||||
///
|
||||
/// In tantivy, a `Document` can hold multiple
|
||||
/// values for a single field.
|
||||
///
|
||||
/// For this reason, the dictionary will associate
|
||||
/// a list of values for every field.
|
||||
fn to_dict(&self, py: Python) -> PyResult<PyObject> {
|
||||
let dict = PyDict::new(py);
|
||||
for (key, values) in &self.field_values {
|
||||
let values_py: Vec<PyObject> = values
|
||||
.iter()
|
||||
.map(|v| value_to_py(py, v))
|
||||
.collect::<PyResult<_>>()?;
|
||||
dict.set_item(key, values_py)?;
|
||||
}
|
||||
Ok(dict.into())
|
||||
}
|
||||
|
||||
/// Add a text value to the document.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the text.
|
||||
/// field_name (str): The field name for which we are adding the text.
|
||||
/// text (str): The text that will be added to the document.
|
||||
fn add_text(&mut self, field: &Field, text: &str) {
|
||||
self.inner.add_text(field.inner, text);
|
||||
fn add_text(&mut self, field_name: String, text: &str) {
|
||||
add_value(self, field_name, text);
|
||||
}
|
||||
|
||||
/// Add an unsigned integer value to the document.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the integer.
|
||||
/// field_name (str): The field name for which we are adding the unsigned integer.
|
||||
/// value (int): The integer that will be added to the document.
|
||||
fn add_unsigned(&mut self, field: &Field, value: u64) {
|
||||
self.inner.add_u64(field.inner, value);
|
||||
fn add_unsigned(&mut self, field_name: String, value: u64) {
|
||||
add_value(self, field_name, value);
|
||||
}
|
||||
|
||||
/// Add a signed integer value to the document.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the integer.
|
||||
/// field_name (str): The field name for which we are adding the integer.
|
||||
/// value (int): The integer that will be added to the document.
|
||||
fn add_integer(&mut self, field: &Field, value: i64) {
|
||||
self.inner.add_i64(field.inner, value);
|
||||
fn add_integer(&mut self, field_name: String, value: i64) {
|
||||
add_value(self, field_name, value);
|
||||
}
|
||||
|
||||
/// Add a date value to the document.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the integer.
|
||||
/// field_name (str): The field name for which we are adding the date.
|
||||
/// value (datetime): The date that will be added to the document.
|
||||
fn add_date(&mut self, field: &Field, value: &PyDateTime) {
|
||||
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
|
||||
let datetime = Utc
|
||||
.ymd(
|
||||
value.get_year(),
|
||||
|
@ -82,37 +262,36 @@ impl Document {
|
|||
value.get_second().into(),
|
||||
value.get_microsecond(),
|
||||
);
|
||||
|
||||
self.inner.add_date(field.inner, &datetime);
|
||||
add_value(self, field_name, datetime);
|
||||
}
|
||||
|
||||
/// Add a facet value to the document.
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the facet.
|
||||
/// field_name (str): The field name for which we are adding the facet.
|
||||
/// value (Facet): The Facet that will be added to the document.
|
||||
fn add_facet(&mut self, field: &Field, value: &Facet) {
|
||||
self.inner.add_facet(field.inner, value.inner.clone());
|
||||
fn add_facet(&mut self, field_name: String, facet: &Facet) {
|
||||
add_value(self, field_name, facet.inner.clone());
|
||||
}
|
||||
|
||||
/// Add a bytes value to the document.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which we are adding the bytes.
|
||||
/// field_name (str): The field name for which we are adding the bytes.
|
||||
/// value (bytes): The bytes that will be added to the document.
|
||||
fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
|
||||
self.inner.add_bytes(field.inner, value);
|
||||
fn add_bytes(&mut self, field_name: String, bytes: Vec<u8>) {
|
||||
add_value(self, field_name, bytes);
|
||||
}
|
||||
|
||||
/// Returns the number of fields that have been added to the document
|
||||
#[getter]
|
||||
fn len(&self) -> usize {
|
||||
self.inner.len()
|
||||
fn num_fields(&self) -> usize {
|
||||
self.field_values.len()
|
||||
}
|
||||
|
||||
/// True if the document is empty, False otherwise.
|
||||
#[getter]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.inner.is_empty()
|
||||
self.field_values.is_empty()
|
||||
}
|
||||
|
||||
/// Get the first value associated with the given field.
|
||||
|
@ -122,9 +301,17 @@ impl Document {
|
|||
///
|
||||
/// Returns the value if one is found, otherwise None.
|
||||
/// The type of the value depends on the field.
|
||||
fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
|
||||
let value = self.inner.get_first(field.inner)?;
|
||||
FieldValue::value_to_py(py, value)
|
||||
fn get_first(
|
||||
&self,
|
||||
py: Python,
|
||||
fieldname: &str,
|
||||
) -> PyResult<Option<PyObject>> {
|
||||
if let Some(value) = self.iter_values_for_field(fieldname).next() {
|
||||
let py_value = value_to_py(py, value)?;
|
||||
Ok(Some(py_value))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all the values associated with the given field.
|
||||
|
@ -134,21 +321,37 @@ impl Document {
|
|||
///
|
||||
/// Returns a list of values.
|
||||
/// The type of the value depends on the field.
|
||||
fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
|
||||
let values = self.inner.get_all(field.inner);
|
||||
values
|
||||
.iter()
|
||||
.map(|&v| FieldValue::value_to_py(py, v))
|
||||
.filter_map(|x| x)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get all the fields and values contained in the document.
|
||||
fn field_values(&self, py: Python) -> Vec<FieldValue> {
|
||||
let field_values = self.inner.field_values();
|
||||
field_values
|
||||
.iter()
|
||||
.map(|v| FieldValue::field_value_to_py(py, v))
|
||||
.collect()
|
||||
fn get_all(&self, py: Python, field_name: &str) -> PyResult<Vec<PyObject>> {
|
||||
self.iter_values_for_field(field_name)
|
||||
.map(|value| value_to_py(py, value))
|
||||
.collect::<PyResult<Vec<_>>>()
|
||||
}
|
||||
}
|
||||
|
||||
impl Document {
|
||||
fn iter_values_for_field<'a>(
|
||||
&'a self,
|
||||
field: &str,
|
||||
) -> impl Iterator<Item = &'a Value> + 'a {
|
||||
self.field_values
|
||||
.get(field)
|
||||
.into_iter()
|
||||
.flat_map(|values| values.iter())
|
||||
}
|
||||
}
|
||||
|
||||
#[pyproto]
|
||||
impl PyMappingProtocol for Document {
|
||||
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
self.get_all(py, field_name)
|
||||
}
|
||||
}
|
||||
|
||||
#[pyproto]
|
||||
impl PyObjectProtocol for Document {
|
||||
fn __repr__(&self) -> PyResult<String> {
|
||||
Ok(format!("{:?}", self))
|
||||
}
|
||||
}
|
||||
|
|
21
src/facet.rs
21
src/facet.rs
|
@ -1,6 +1,6 @@
|
|||
use pyo3::basic::PyObjectProtocol;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyType;
|
||||
|
||||
use tantivy::schema;
|
||||
|
||||
/// A Facet represents a point in a given hierarchy.
|
||||
|
@ -52,4 +52,23 @@ impl Facet {
|
|||
inner: schema::Facet::from_text(facet_string),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the list of `segments` that form a facet path.
|
||||
///
|
||||
/// For instance `//europe/france` becomes `["europe", "france"]`.
|
||||
fn to_path(&self) -> Vec<&str> {
|
||||
self.inner.to_path()
|
||||
}
|
||||
|
||||
/// Returns the facet string representation.
|
||||
fn to_path_str(&self) -> String {
|
||||
self.inner.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[pyproto]
|
||||
impl PyObjectProtocol for Facet {
|
||||
fn __repr__(&self) -> PyResult<String> {
|
||||
Ok(format!("Facet({})", self.to_path_str()))
|
||||
}
|
||||
}
|
||||
|
|
73
src/field.rs
73
src/field.rs
|
@ -1,73 +0,0 @@
|
|||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyDateTime;
|
||||
|
||||
use tantivy::schema;
|
||||
|
||||
use crate::facet::Facet;
|
||||
|
||||
/// Field is a numeric indentifier that represents an entry in the Schema.
|
||||
#[pyclass]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Field {
|
||||
pub(crate) inner: schema::Field,
|
||||
}
|
||||
|
||||
/// FieldValue holds together a Field and its Value.
|
||||
#[pyclass]
|
||||
pub(crate) struct FieldValue {
|
||||
pub(crate) field: Field,
|
||||
pub(crate) value: PyObject,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl FieldValue {
|
||||
#[getter]
|
||||
fn field(&self) -> Field {
|
||||
self.field.clone()
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn value(&self) -> &PyObject {
|
||||
&self.value
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldValue {
|
||||
pub(crate) fn value_to_py(
|
||||
py: Python,
|
||||
value: &schema::Value,
|
||||
) -> Option<PyObject> {
|
||||
match value {
|
||||
schema::Value::Str(text) => Some(text.into_object(py)),
|
||||
schema::Value::U64(num) => Some(num.into_object(py)),
|
||||
schema::Value::I64(num) => Some(num.into_object(py)),
|
||||
schema::Value::F64(num) => Some(num.into_object(py)),
|
||||
schema::Value::Bytes(b) => Some(b.to_object(py)),
|
||||
schema::Value::Date(d) => {
|
||||
let date =
|
||||
PyDateTime::from_timestamp(py, d.timestamp() as f64, None);
|
||||
|
||||
match date {
|
||||
Ok(d) => Some(d.into_object(py)),
|
||||
Err(_e) => None,
|
||||
}
|
||||
}
|
||||
schema::Value::Facet(f) => {
|
||||
Some(Facet { inner: f.clone() }.into_object(py))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn field_value_to_py(
|
||||
py: Python,
|
||||
field_value: &schema::FieldValue,
|
||||
) -> FieldValue {
|
||||
let value = field_value.value();
|
||||
let field = field_value.field();
|
||||
|
||||
FieldValue {
|
||||
field: Field { inner: field },
|
||||
value: FieldValue::value_to_py(py, value).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
253
src/index.rs
253
src/index.rs
|
@ -4,60 +4,24 @@ use pyo3::exceptions;
|
|||
use pyo3::prelude::*;
|
||||
|
||||
use crate::document::Document;
|
||||
use crate::query::Query;
|
||||
use crate::schema::Schema;
|
||||
use crate::searcher::Searcher;
|
||||
use crate::to_pyerr;
|
||||
use tantivy as tv;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::schema::{Field, NamedFieldDocument};
|
||||
|
||||
const RELOAD_POLICY: &str = "commit";
|
||||
|
||||
/// IndexReader is the entry point to read and search the index.
|
||||
///
|
||||
/// IndexReader controls when a new version of the index should be loaded and
|
||||
/// lends you instances of Searcher for the last loaded version.
|
||||
///
|
||||
/// To create an IndexReader first create an Index and call the reader() method
|
||||
/// on the index object.
|
||||
#[pyclass]
|
||||
pub(crate) struct IndexReader {
|
||||
inner: tv::IndexReader,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl IndexReader {
|
||||
/// Update searchers so that they reflect the state of the last .commit().
|
||||
///
|
||||
/// If you set up the the reload policy to be on 'commit' (which is the
|
||||
/// default) every commit should be rapidly reflected on your IndexReader
|
||||
/// and you should not need to call reload() at all.
|
||||
fn reload(&self) -> PyResult<()> {
|
||||
let ret = self.inner.reload();
|
||||
match ret {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a Searcher for the index.
|
||||
///
|
||||
/// This method should be called every single time a search query is
|
||||
/// performed. The searchers are taken from a pool of num_searchers
|
||||
/// searchers.
|
||||
///
|
||||
/// Returns a Searcher object, if no searcher is available this may block.
|
||||
fn searcher(&self) -> Searcher {
|
||||
let searcher = self.inner.searcher();
|
||||
Searcher { inner: searcher }
|
||||
}
|
||||
}
|
||||
|
||||
/// IndexWriter is the user entry-point to add documents to the index.
|
||||
///
|
||||
/// To create an IndexWriter first create an Index and call the writer() method
|
||||
/// on the index object.
|
||||
#[pyclass]
|
||||
pub(crate) struct IndexWriter {
|
||||
inner: tv::IndexWriter,
|
||||
inner_index_writer: tv::IndexWriter,
|
||||
schema: tv::schema::Schema,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
|
@ -70,9 +34,24 @@ impl IndexWriter {
|
|||
/// by the client to align commits with its own document queue.
|
||||
/// The `opstamp` represents the number of documents that have been added
|
||||
/// since the creation of the index.
|
||||
fn add_document(&mut self, document: &Document) -> PyResult<()> {
|
||||
self.inner.add_document(document.inner.clone());
|
||||
Ok(())
|
||||
pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
|
||||
let named_doc = NamedFieldDocument(doc.field_values.clone());
|
||||
let doc = self.schema.convert_named_doc(named_doc).map_err(to_pyerr)?;
|
||||
Ok(self.inner_index_writer.add_document(doc))
|
||||
}
|
||||
|
||||
/// Helper for the `add_document` method, but passing a json string.
|
||||
///
|
||||
/// If the indexing pipeline is full, this call may block.
|
||||
///
|
||||
/// Returns an `opstamp`, which is an increasing integer that can be used
|
||||
/// by the client to align commits with its own document queue.
|
||||
/// The `opstamp` represents the number of documents that have been added
|
||||
/// since the creation of the index.
|
||||
pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
|
||||
let doc = self.schema.parse_document(json).map_err(to_pyerr)?;
|
||||
let opstamp = self.inner_index_writer.add_document(doc);
|
||||
Ok(opstamp)
|
||||
}
|
||||
|
||||
/// Commits all of the pending changes
|
||||
|
@ -84,12 +63,8 @@ impl IndexWriter {
|
|||
/// spared), it will be possible to resume indexing from this point.
|
||||
///
|
||||
/// Returns the `opstamp` of the last document that made it in the commit.
|
||||
fn commit(&mut self) -> PyResult<()> {
|
||||
let ret = self.inner.commit();
|
||||
match ret {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
fn commit(&mut self) -> PyResult<u64> {
|
||||
self.inner_index_writer.commit().map_err(to_pyerr)
|
||||
}
|
||||
|
||||
/// Rollback to the last commit
|
||||
|
@ -97,23 +72,15 @@ impl IndexWriter {
|
|||
/// This cancels all of the updates that happened after the last
|
||||
/// commit. After calling rollback, the index is in the same state as it
|
||||
/// was after the last commit.
|
||||
fn rollback(&mut self) -> PyResult<()> {
|
||||
let ret = self.inner.rollback();
|
||||
|
||||
match ret {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
fn rollback(&mut self) -> PyResult<u64> {
|
||||
self.inner_index_writer.rollback().map_err(to_pyerr)
|
||||
}
|
||||
|
||||
/// Detects and removes the files that are not used by the index anymore.
|
||||
fn garbage_collect_files(&mut self) -> PyResult<()> {
|
||||
let ret = self.inner.garbage_collect_files();
|
||||
|
||||
match ret {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
self.inner_index_writer
|
||||
.garbage_collect_files()
|
||||
.map_err(to_pyerr)
|
||||
}
|
||||
|
||||
/// The opstamp of the last successful commit.
|
||||
|
@ -125,7 +92,7 @@ impl IndexWriter {
|
|||
/// for searchers.
|
||||
#[getter]
|
||||
fn commit_opstamp(&self) -> u64 {
|
||||
self.inner.commit_opstamp()
|
||||
self.inner_index_writer.commit_opstamp()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -142,11 +109,19 @@ impl IndexWriter {
|
|||
/// if there was a problem during the opening or creation of the index.
|
||||
#[pyclass]
|
||||
pub(crate) struct Index {
|
||||
pub(crate) inner: tv::Index,
|
||||
pub(crate) index: tv::Index,
|
||||
reader: tv::IndexReader,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Index {
|
||||
#[staticmethod]
|
||||
fn open(path: &str) -> PyResult<Index> {
|
||||
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
|
||||
let reader = index.reader().map_err(to_pyerr)?;
|
||||
Ok(Index { index, reader })
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(reuse = true)]
|
||||
fn new(
|
||||
|
@ -157,32 +132,19 @@ impl Index {
|
|||
) -> PyResult<()> {
|
||||
let index = match path {
|
||||
Some(p) => {
|
||||
let directory = MmapDirectory::open(p);
|
||||
|
||||
let dir = match directory {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
return Err(exceptions::OSError::py_err(e.to_string()))
|
||||
}
|
||||
};
|
||||
|
||||
let i = if reuse {
|
||||
tv::Index::open_or_create(dir, schema.inner.clone())
|
||||
let directory = MmapDirectory::open(p).map_err(to_pyerr)?;
|
||||
if reuse {
|
||||
tv::Index::open_or_create(directory, schema.inner.clone())
|
||||
} else {
|
||||
tv::Index::create(dir, schema.inner.clone())
|
||||
};
|
||||
|
||||
match i {
|
||||
Ok(index) => index,
|
||||
Err(e) => {
|
||||
return Err(exceptions::OSError::py_err(e.to_string()))
|
||||
}
|
||||
tv::Index::create(directory, schema.inner.clone())
|
||||
}
|
||||
.map_err(to_pyerr)?
|
||||
}
|
||||
None => tv::Index::create_in_ram(schema.inner.clone()),
|
||||
};
|
||||
|
||||
obj.init(Index { inner: index });
|
||||
let reader = index.reader().map_err(to_pyerr)?;
|
||||
obj.init(Index { index, reader });
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -206,32 +168,30 @@ impl Index {
|
|||
num_threads: usize,
|
||||
) -> PyResult<IndexWriter> {
|
||||
let writer = match num_threads {
|
||||
0 => self.inner.writer(heap_size),
|
||||
_ => self.inner.writer_with_num_threads(num_threads, heap_size),
|
||||
};
|
||||
|
||||
match writer {
|
||||
Ok(w) => Ok(IndexWriter { inner: w }),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
0 => self.index.writer(heap_size),
|
||||
_ => self.index.writer_with_num_threads(num_threads, heap_size),
|
||||
}
|
||||
.map_err(to_pyerr)?;
|
||||
let schema = self.index.schema();
|
||||
Ok(IndexWriter {
|
||||
inner_index_writer: writer,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an IndexReader for the index.
|
||||
/// Configure the index reader.
|
||||
///
|
||||
/// Args:
|
||||
/// reload_policy (str, optional): The reload policy that the
|
||||
/// IndexReader should use. Can be manual or OnCommit.
|
||||
/// IndexReader should use. Can be `Manual` or `OnCommit`.
|
||||
/// num_searchers (int, optional): The number of searchers that the
|
||||
/// reader should create.
|
||||
///
|
||||
/// Returns the IndexReader on success, raises ValueError if an IndexReader
|
||||
/// couldn't be created.
|
||||
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
|
||||
fn reader(
|
||||
&self,
|
||||
fn config_reader(
|
||||
&mut self,
|
||||
reload_policy: &str,
|
||||
num_searchers: usize,
|
||||
) -> PyResult<IndexReader> {
|
||||
) -> Result<(), PyErr> {
|
||||
let reload_policy = reload_policy.to_lowercase();
|
||||
let reload_policy = match reload_policy.as_ref() {
|
||||
"commit" => tv::ReloadPolicy::OnCommit,
|
||||
|
@ -242,9 +202,7 @@ impl Index {
|
|||
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
|
||||
))
|
||||
};
|
||||
|
||||
let builder = self.inner.reader_builder();
|
||||
|
||||
let builder = self.index.reader_builder();
|
||||
let builder = builder.reload_policy(reload_policy);
|
||||
let builder = if num_searchers > 0 {
|
||||
builder.num_searchers(num_searchers)
|
||||
|
@ -252,10 +210,23 @@ impl Index {
|
|||
builder
|
||||
};
|
||||
|
||||
let reader = builder.try_into();
|
||||
match reader {
|
||||
Ok(r) => Ok(IndexReader { inner: r }),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
self.reader = builder.try_into().map_err(to_pyerr)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Acquires a Searcher from the searcher pool.
|
||||
///
|
||||
/// If no searcher is available during the call, note that
|
||||
/// this call will block until one is made available.
|
||||
///
|
||||
/// Searchers are automatically released back into the pool when
|
||||
/// they are dropped. If you observe this function to block forever
|
||||
/// you probably should configure the Index to have a larger
|
||||
/// searcher pool, or you are holding references to previous searchers
|
||||
/// forever.
|
||||
fn searcher(&self) -> Searcher {
|
||||
Searcher {
|
||||
inner: self.reader.searcher(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -268,19 +239,71 @@ impl Index {
|
|||
/// Raises OSError if the directory cannot be opened.
|
||||
#[staticmethod]
|
||||
fn exists(path: &str) -> PyResult<bool> {
|
||||
let directory = MmapDirectory::open(path);
|
||||
let dir = match directory {
|
||||
Ok(d) => d,
|
||||
Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
|
||||
};
|
||||
|
||||
Ok(tv::Index::exists(&dir))
|
||||
let directory = MmapDirectory::open(path).map_err(to_pyerr)?;
|
||||
Ok(tv::Index::exists(&directory))
|
||||
}
|
||||
|
||||
/// The schema of the current index.
|
||||
#[getter]
|
||||
fn schema(&self) -> Schema {
|
||||
let schema = self.inner.schema();
|
||||
let schema = self.index.schema();
|
||||
Schema { inner: schema }
|
||||
}
|
||||
|
||||
/// Update searchers so that they reflect the state of the last .commit().
|
||||
///
|
||||
/// If you set up the reload policy to be on 'commit' (which is the
|
||||
/// default) every commit should be rapidly reflected on your IndexReader
|
||||
/// and you should not need to call reload() at all.
|
||||
fn reload(&self) -> PyResult<()> {
|
||||
self.reader.reload().map_err(to_pyerr)
|
||||
}
|
||||
|
||||
/// Parse a query
|
||||
///
|
||||
/// Args:
|
||||
/// query: the query, following the tantivy query language.
|
||||
/// default_field_names (List[str], optional): A list of field names used to search if no
|
||||
/// field is specified in the query.
|
||||
///
|
||||
#[args(reload_policy = "RELOAD_POLICY")]
|
||||
pub fn parse_query(
|
||||
&self,
|
||||
query: &str,
|
||||
default_field_names: Option<Vec<String>>,
|
||||
) -> PyResult<Query> {
|
||||
let mut default_fields = vec![];
|
||||
let schema = self.index.schema();
|
||||
if let Some(default_field_names_vec) = default_field_names {
|
||||
for default_field_name in &default_field_names_vec {
|
||||
if let Some(field) = schema.get_field(default_field_name) {
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
if !field_entry.is_indexed() {
|
||||
return Err(exceptions::ValueError::py_err(format!(
|
||||
"Field `{}` is not set as indexed in the schema.",
|
||||
default_field_name
|
||||
)));
|
||||
}
|
||||
default_fields.push(field);
|
||||
} else {
|
||||
return Err(exceptions::ValueError::py_err(format!(
|
||||
"Field `{}` is not defined in the schema.",
|
||||
default_field_name
|
||||
)));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (field_id, field_entry) in
|
||||
self.index.schema().fields().iter().enumerate()
|
||||
{
|
||||
if field_entry.is_indexed() {
|
||||
default_fields.push(Field(field_id as u32));
|
||||
}
|
||||
}
|
||||
}
|
||||
let parser =
|
||||
tv::query::QueryParser::for_index(&self.index, default_fields);
|
||||
let query = parser.parse_query(query).map_err(to_pyerr)?;
|
||||
Ok(Query { inner: query })
|
||||
}
|
||||
}
|
||||
|
|
14
src/lib.rs
14
src/lib.rs
|
@ -1,8 +1,8 @@
|
|||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
|
||||
mod document;
|
||||
mod facet;
|
||||
mod field;
|
||||
mod index;
|
||||
mod query;
|
||||
mod schema;
|
||||
|
@ -11,9 +11,7 @@ mod searcher;
|
|||
|
||||
use document::Document;
|
||||
use facet::Facet;
|
||||
use field::{Field, FieldValue};
|
||||
use index::Index;
|
||||
use query::QueryParser;
|
||||
use schema::Schema;
|
||||
use schemabuilder::SchemaBuilder;
|
||||
use searcher::{DocAddress, Searcher, TopDocs};
|
||||
|
@ -75,14 +73,14 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||
m.add_class::<Schema>()?;
|
||||
m.add_class::<SchemaBuilder>()?;
|
||||
m.add_class::<Searcher>()?;
|
||||
m.add_class::<Index>()?;
|
||||
m.add_class::<QueryParser>()?;
|
||||
m.add_class::<Document>()?;
|
||||
m.add_class::<Index>()?;
|
||||
m.add_class::<DocAddress>()?;
|
||||
m.add_class::<TopDocs>()?;
|
||||
m.add_class::<Field>()?;
|
||||
m.add_class::<FieldValue>()?;
|
||||
m.add_class::<Facet>()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn to_pyerr<E: ToString>(err: E) -> PyErr {
|
||||
exceptions::ValueError::py_err(err.to_string())
|
||||
}
|
||||
|
|
64
src/query.rs
64
src/query.rs
|
@ -1,70 +1,16 @@
|
|||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::PyType;
|
||||
|
||||
use pyo3::PyObjectProtocol;
|
||||
use tantivy as tv;
|
||||
|
||||
use crate::field::Field;
|
||||
use crate::index::Index;
|
||||
|
||||
/// Tantivy's Query
|
||||
#[pyclass]
|
||||
pub(crate) struct Query {
|
||||
pub(crate) inner: Box<dyn tv::query::Query>,
|
||||
}
|
||||
|
||||
/// Tantivy's Query parser
|
||||
#[pyclass]
|
||||
pub(crate) struct QueryParser {
|
||||
inner: tv::query::QueryParser,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl QueryParser {
|
||||
/// Creates a QueryParser for an Index.
|
||||
///
|
||||
/// Args:
|
||||
/// index (Index): The index for which the query will be created.
|
||||
/// default_fields (List[Field]): A list of fields used to search if no
|
||||
/// field is specified in the query.
|
||||
///
|
||||
/// Returns the QueryParser.
|
||||
#[classmethod]
|
||||
fn for_index(
|
||||
_cls: &PyType,
|
||||
index: &Index,
|
||||
default_fields: Vec<&Field>,
|
||||
) -> PyResult<QueryParser> {
|
||||
let default_fields: Vec<tv::schema::Field> =
|
||||
default_fields.iter().map(|&f| f.inner).collect();
|
||||
|
||||
let parser =
|
||||
tv::query::QueryParser::for_index(&index.inner, default_fields);
|
||||
Ok(QueryParser { inner: parser })
|
||||
}
|
||||
|
||||
/// Parse a string into a query that can be given to a searcher.
|
||||
///
|
||||
/// Args:
|
||||
/// query (str): A query string that should be parsed into a query.
|
||||
///
|
||||
/// Returns the parsed Query object. Raises ValueError if there was an
|
||||
/// error with the query string.
|
||||
fn parse_query(&self, query: &str) -> PyResult<Query> {
|
||||
let ret = self.inner.parse_query(query);
|
||||
|
||||
match ret {
|
||||
Ok(q) => Ok(Query { inner: q }),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the default way to compose queries to a conjunction.
|
||||
///
|
||||
/// By default, the query happy tax payer is equivalent to the query happy
|
||||
/// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
|
||||
/// payer will be interpreted by the parser as happy AND tax AND payer.
|
||||
fn set_conjunction_by_default(&mut self) {
|
||||
self.inner.set_conjunction_by_default();
|
||||
#[pyproto]
|
||||
impl PyObjectProtocol for Query {
|
||||
fn __repr__(&self) -> PyResult<String> {
|
||||
Ok(format!("Query({:?})", self.inner))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
|
||||
use tantivy::schema;
|
||||
|
||||
use crate::document::Document;
|
||||
use crate::field::Field;
|
||||
use tantivy as tv;
|
||||
|
||||
/// Tantivy schema.
|
||||
///
|
||||
|
@ -12,54 +7,8 @@ use crate::field::Field;
|
|||
/// provided.
|
||||
#[pyclass]
|
||||
pub(crate) struct Schema {
|
||||
pub(crate) inner: schema::Schema,
|
||||
pub(crate) inner: tv::schema::Schema,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Schema {
|
||||
/// Build a document object from a json string.
|
||||
///
|
||||
/// Args:
|
||||
/// doc_json (str) - A string containing json that should be parsed
|
||||
/// into a `Document`
|
||||
///
|
||||
/// Returns the parsed document, raises a ValueError if the parsing failed.
|
||||
fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
|
||||
let ret = self.inner.parse_document(doc_json);
|
||||
match ret {
|
||||
Ok(d) => Ok(Document { inner: d }),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a `Document` object into a json string.
|
||||
///
|
||||
/// Args:
|
||||
/// doc (Document): The document that will be converted into a json
|
||||
/// string.
|
||||
fn to_json(&self, doc: &Document) -> String {
|
||||
self.inner.to_json(&doc.inner)
|
||||
}
|
||||
|
||||
/// Return the field name for a given `Field`.
|
||||
///
|
||||
/// Args:
|
||||
/// field (Field): The field for which the name will be returned.
|
||||
fn get_field_name(&self, field: &Field) -> &str {
|
||||
self.inner.get_field_name(field.inner)
|
||||
}
|
||||
|
||||
/// Returns the field option associated with a given name.
|
||||
///
|
||||
/// Args:
|
||||
/// name (str): The name of the field that we want to retrieve.
|
||||
///
|
||||
/// Returns the Field if one is found, None otherwise.
|
||||
fn get_field(&self, name: &str) -> Option<Field> {
|
||||
let f = self.inner.get_field(name);
|
||||
match f {
|
||||
Some(field) => Some(Field { inner: field }),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl Schema {}
|
||||
|
|
|
@ -5,8 +5,8 @@ use pyo3::prelude::*;
|
|||
|
||||
use tantivy::schema;
|
||||
|
||||
use crate::field::Field;
|
||||
use crate::schema::Schema;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
/// Tantivy has a very strict schema.
|
||||
/// You need to specify in advance whether a field is indexed or not,
|
||||
|
@ -24,8 +24,9 @@ use crate::schema::Schema;
|
|||
///
|
||||
/// >>> schema = builder.build()
|
||||
#[pyclass]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct SchemaBuilder {
|
||||
pub(crate) builder: Option<schema::SchemaBuilder>,
|
||||
pub(crate) builder: Arc<RwLock<Option<schema::SchemaBuilder>>>,
|
||||
}
|
||||
|
||||
const TOKENIZER: &str = "default";
|
||||
|
@ -36,7 +37,7 @@ impl SchemaBuilder {
|
|||
#[new]
|
||||
fn new(obj: &PyRawObject) {
|
||||
obj.init(SchemaBuilder {
|
||||
builder: Some(schema::Schema::builder()),
|
||||
builder: Arc::new(From::from(Some(schema::Schema::builder()))),
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -70,9 +71,8 @@ impl SchemaBuilder {
|
|||
stored: bool,
|
||||
tokenizer_name: &str,
|
||||
index_option: &str,
|
||||
) -> PyResult<Field> {
|
||||
) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
let index_option = match index_option {
|
||||
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
|
||||
"freq" => schema::IndexRecordOption::WithFreqs,
|
||||
|
@ -94,14 +94,14 @@ impl SchemaBuilder {
|
|||
options
|
||||
};
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_text_field(name, options);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_text_field(name, options);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Add a new signed integer field to the schema.
|
||||
|
@ -131,19 +131,19 @@ impl SchemaBuilder {
|
|||
stored: bool,
|
||||
indexed: bool,
|
||||
fast: Option<&str>,
|
||||
) -> PyResult<Field> {
|
||||
) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_i64_field(name, opts);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_i64_field(name, opts);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Add a new unsigned integer field to the schema.
|
||||
|
@ -173,19 +173,19 @@ impl SchemaBuilder {
|
|||
stored: bool,
|
||||
indexed: bool,
|
||||
fast: Option<&str>,
|
||||
) -> PyResult<Field> {
|
||||
) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_u64_field(name, opts);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_u64_field(name, opts);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Add a new date field to the schema.
|
||||
|
@ -215,35 +215,35 @@ impl SchemaBuilder {
|
|||
stored: bool,
|
||||
indexed: bool,
|
||||
fast: Option<&str>,
|
||||
) -> PyResult<Field> {
|
||||
) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_date_field(name, opts);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_date_field(name, opts);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Add a Facet field to the schema.
|
||||
/// Args:
|
||||
/// name (str): The name of the field.
|
||||
fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
|
||||
fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_facet_field(name);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_facet_field(name);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Add a fast bytes field to the schema.
|
||||
|
@ -254,17 +254,17 @@ impl SchemaBuilder {
|
|||
///
|
||||
/// Args:
|
||||
/// name (str): The name of the field.
|
||||
fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
|
||||
fn add_bytes_field(&mut self, name: &str) -> PyResult<Self> {
|
||||
let builder = &mut self.builder;
|
||||
|
||||
if let Some(builder) = builder {
|
||||
let field = builder.add_bytes_field(name);
|
||||
Ok(Field { inner: field })
|
||||
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||
builder.add_bytes_field(name);
|
||||
} else {
|
||||
Err(exceptions::ValueError::py_err(
|
||||
return Err(exceptions::ValueError::py_err(
|
||||
"Schema builder object isn't valid anymore.",
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(self.clone())
|
||||
}
|
||||
|
||||
/// Finalize the creation of a Schema.
|
||||
|
@ -272,7 +272,7 @@ impl SchemaBuilder {
|
|||
/// Returns a Schema object. After this is called the SchemaBuilder cannot
|
||||
/// be used anymore.
|
||||
fn build(&mut self) -> PyResult<Schema> {
|
||||
let builder = self.builder.take();
|
||||
let builder = self.builder.write().unwrap().take();
|
||||
if let Some(builder) = builder {
|
||||
let schema = builder.build();
|
||||
Ok(Schema { inner: schema })
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
#![allow(clippy::new_ret_no_self)]
|
||||
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
|
||||
use tantivy as tv;
|
||||
|
||||
use crate::document::Document;
|
||||
use crate::query::Query;
|
||||
use crate::to_pyerr;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::{exceptions, PyObjectProtocol};
|
||||
use tantivy as tv;
|
||||
|
||||
/// Tantivy's Searcher class
|
||||
///
|
||||
|
@ -60,11 +59,11 @@ impl Searcher {
|
|||
///
|
||||
/// Returns the Document, raises ValueError if the document can't be found.
|
||||
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
|
||||
let ret = self.inner.doc(doc_address.into());
|
||||
match ret {
|
||||
Ok(doc) => Ok(Document { inner: doc }),
|
||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||
}
|
||||
let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
|
||||
let named_doc = self.inner.schema().to_named_doc(&doc);
|
||||
Ok(Document {
|
||||
field_values: named_doc.0,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -132,3 +131,14 @@ impl TopDocs {
|
|||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[pyproto]
|
||||
impl PyObjectProtocol for Searcher {
|
||||
fn __repr__(&self) -> PyResult<String> {
|
||||
Ok(format!(
|
||||
"Searcher(num_docs={}, num_segments={})",
|
||||
self.inner.num_docs(),
|
||||
self.inner.segment_readers().len()
|
||||
))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,192 +1,188 @@
|
|||
import json
|
||||
import tantivy
|
||||
|
||||
import pytest
|
||||
|
||||
from tantivy import Document, Index, SchemaBuilder, Schema
|
||||
|
||||
|
||||
def schema():
|
||||
return SchemaBuilder() \
|
||||
.add_text_field("title", stored=True) \
|
||||
.add_text_field("body") \
|
||||
.build()
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def ram_index():
|
||||
# assume all tests will use the same documents for now
|
||||
# other methods may set up function-local indexes
|
||||
builder = tantivy.SchemaBuilder()
|
||||
# assume all tests will use the same documents for now
|
||||
# other methods may set up function-local indexes
|
||||
index = Index(schema())
|
||||
writer = index.writer()
|
||||
|
||||
title = builder.add_text_field("title", stored=True)
|
||||
body = builder.add_text_field("body")
|
||||
|
||||
schema = builder.build()
|
||||
index = tantivy.Index(schema)
|
||||
|
||||
writer = index.writer()
|
||||
|
||||
# 2 ways of adding documents
|
||||
# 1
|
||||
doc = tantivy.Document()
|
||||
# create a document instance
|
||||
# add field-value pairs
|
||||
doc.add_text(title, "The Old Man and the Sea")
|
||||
doc.add_text(body, ("He was an old man who fished alone in a skiff in"
|
||||
"the Gulf Stream and he had gone eighty-four days "
|
||||
"now without taking a fish."))
|
||||
writer.add_document(doc)
|
||||
# 2 use the built-in json support
|
||||
# keys need to coincide with field names
|
||||
doc = schema.parse_document(json.dumps({
|
||||
"title": "Of Mice and Men",
|
||||
"body": ("A few miles south of Soledad, the Salinas River drops "
|
||||
"in close to the hillside bank and runs deep and "
|
||||
"green. The water is warm too, for it has slipped "
|
||||
"twinkling over the yellow sands in the sunlight "
|
||||
"before reaching the narrow pool. On one side of the "
|
||||
"river the golden foothill slopes curve up to the "
|
||||
"strong and rocky Gabilan Mountains, but on the valley "
|
||||
"side the water is lined with trees—willows fresh and "
|
||||
"green with every spring, carrying in their lower leaf "
|
||||
"junctures the debris of the winter’s flooding; and "
|
||||
"sycamores with mottled, white, recumbent limbs and "
|
||||
"branches that arch over the pool")
|
||||
}))
|
||||
|
||||
writer.add_document(doc)
|
||||
|
||||
doc = schema.parse_document(json.dumps({
|
||||
# 2 ways of adding documents
|
||||
# 1
|
||||
doc = Document()
|
||||
# create a document instance
|
||||
# add field-value pairs
|
||||
doc.add_text("title", "The Old Man and the Sea")
|
||||
doc.add_text("body", ("He was an old man who fished alone in a skiff in"
|
||||
"the Gulf Stream and he had gone eighty-four days "
|
||||
"now without taking a fish."))
|
||||
writer.add_document(doc)
|
||||
# 2 use the built-in json support
|
||||
# keys need to coincide with field names
|
||||
doc = Document.from_dict({
|
||||
"title": "Of Mice and Men",
|
||||
"body": ("A few miles south of Soledad, the Salinas River drops "
|
||||
"in close to the hillside bank and runs deep and "
|
||||
"green. The water is warm too, for it has slipped "
|
||||
"twinkling over the yellow sands in the sunlight "
|
||||
"before reaching the narrow pool. On one side of the "
|
||||
"river the golden foothill slopes curve up to the "
|
||||
"strong and rocky Gabilan Mountains, but on the valley "
|
||||
"side the water is lined with trees—willows fresh and "
|
||||
"green with every spring, carrying in their lower leaf "
|
||||
"junctures the debris of the winter’s flooding; and "
|
||||
"sycamores with mottled, white, recumbent limbs and "
|
||||
"branches that arch over the pool")
|
||||
})
|
||||
writer.add_document(doc)
|
||||
writer.add_json("""{
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": ("You will rejoice to hear that no disaster has "
|
||||
"accompanied the commencement of an enterprise which you "
|
||||
"have regarded with such evil forebodings. I arrived "
|
||||
"here yesterday, and my first task is to assure my dear "
|
||||
"sister of my welfare and increasing confidence in the "
|
||||
"success of my undertaking.")
|
||||
}))
|
||||
|
||||
writer.add_document(doc)
|
||||
writer.commit()
|
||||
|
||||
reader = index.reader()
|
||||
searcher = reader.searcher()
|
||||
index = index
|
||||
schema = schema
|
||||
default_args = [title, body]
|
||||
ret = (index, searcher, schema, default_args, title, body)
|
||||
return ret
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
|
||||
}""")
|
||||
writer.commit()
|
||||
index.reload()
|
||||
return index
|
||||
|
||||
|
||||
class TestClass(object):
|
||||
|
||||
def test_simple_search(self, ram_index):
|
||||
index, searcher, schema, default_args, title, body = ram_index
|
||||
query_parser = tantivy.QueryParser.for_index(index, default_args)
|
||||
query = query_parser.parse_query("sea whale")
|
||||
index = ram_index
|
||||
query = index.parse_query("sea whale", ["title", "body"])
|
||||
|
||||
top_docs = tantivy.TopDocs(10)
|
||||
|
||||
result = searcher.search(query, top_docs)
|
||||
print(result)
|
||||
|
||||
result = index.searcher().search(query, top_docs)
|
||||
assert len(result) == 1
|
||||
|
||||
_, doc_address = result[0]
|
||||
|
||||
searched_doc = searcher.doc(doc_address)
|
||||
assert searched_doc.get_first(title) == "The Old Man and the Sea"
|
||||
|
||||
def test_doc(self):
|
||||
builder = tantivy.SchemaBuilder()
|
||||
title = builder.add_text_field("title", stored=True)
|
||||
|
||||
doc = tantivy.Document()
|
||||
assert doc.is_empty
|
||||
|
||||
doc.add_text(title, "The Old Man and the Sea")
|
||||
|
||||
assert doc.get_first(title) == "The Old Man and the Sea"
|
||||
|
||||
assert doc.len == 1
|
||||
assert not doc.is_empty
|
||||
searched_doc = index.searcher().doc(doc_address)
|
||||
assert searched_doc["title"] == ["The Old Man and the Sea"]
|
||||
|
||||
def test_and_query(self, ram_index):
|
||||
index, searcher, schema, default_args, title, body = ram_index
|
||||
q_parser = tantivy.QueryParser.for_index(index, default_args)
|
||||
index = ram_index
|
||||
query = index.parse_query("title:men AND body:summer", default_field_names=["title", "body"])
|
||||
# look for an intersection of documents
|
||||
query = q_parser.parse_query("title:men AND body:summer")
|
||||
top_docs = tantivy.TopDocs(10)
|
||||
|
||||
searcher = index.searcher()
|
||||
result = searcher.search(query, top_docs)
|
||||
print(result)
|
||||
|
||||
# summer isn't present
|
||||
assert len(result) == 0
|
||||
|
||||
query = q_parser.parse_query("title:men AND body:winter")
|
||||
query = index.parse_query("title:men AND body:winter", ["title", "body"])
|
||||
result = searcher.search(query, top_docs)
|
||||
|
||||
assert len(result) == 1
|
||||
|
||||
def test_and_query_parser_default_fields(self, ram_index):
|
||||
query = ram_index.parse_query("winter", default_field_names=["title"])
|
||||
assert repr(query) == """Query(TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114])))"""
|
||||
|
||||
def test_and_query_parser_default_fields_undefined(self, ram_index):
|
||||
query = ram_index.parse_query("winter")
|
||||
assert repr(query) == "Query(BooleanQuery { subqueries: [" \
|
||||
"(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \
|
||||
"(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \
|
||||
"})"
|
||||
|
||||
def test_query_errors(self, ram_index):
|
||||
index, searcher, schema, default_args, title, body = ram_index
|
||||
q_parser = tantivy.QueryParser.for_index(index, default_args)
|
||||
index = ram_index
|
||||
# no "bod" field
|
||||
with pytest.raises(ValueError):
|
||||
q_parser.parse_query("bod:title")
|
||||
index.parse_query("bod:men", ["title", "body"])
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def disk_index():
|
||||
builder = tantivy.SchemaBuilder()
|
||||
title = builder.add_text_field("title", stored=True)
|
||||
body = builder.add_text_field("body")
|
||||
default_args = [title, body]
|
||||
schema = builder.build()
|
||||
schema = schema
|
||||
index = tantivy.Index(schema)
|
||||
path_to_index = "tests/test_index/"
|
||||
return index, path_to_index, schema, default_args, title, body
|
||||
PATH_TO_INDEX = "tests/test_index/"
|
||||
|
||||
|
||||
class TestFromDiskClass(object):
|
||||
|
||||
def test_exists(self, disk_index):
|
||||
def test_exists(self):
|
||||
# prefer to keep it separate in case anyone deletes this
|
||||
# runs from the root directory
|
||||
index, path_to_index, _, _, _, _ = disk_index
|
||||
assert index.exists(path_to_index)
|
||||
assert Index.exists(PATH_TO_INDEX)
|
||||
|
||||
def test_opens_from_dir(self, disk_index):
|
||||
_, path_to_index, schema, _, _, _ = disk_index
|
||||
tantivy.Index(schema, path_to_index)
|
||||
def test_opens_from_dir(self):
|
||||
index = Index(schema(), PATH_TO_INDEX, reuse=True)
|
||||
assert index.searcher().num_docs == 3
|
||||
|
||||
def test_create_readers(self, disk_index):
|
||||
_, path_to_index, schema, _, _, _ = disk_index
|
||||
idx = tantivy.Index(schema, path_to_index)
|
||||
reload_policy = "OnCommit" # or "Manual"
|
||||
assert idx.reader(reload_policy, 4)
|
||||
assert idx.reader("Manual", 4)
|
||||
def test_create_readers(self):
|
||||
# not sure what is the point of this test.
|
||||
idx = Index(schema())
|
||||
assert idx.searcher().num_docs == 0
|
||||
# by default this is manual mode
|
||||
writer = idx.writer(30000000, 1)
|
||||
writer.add_document(Document(title="mytitle", body="mybody"))
|
||||
writer.commit()
|
||||
assert idx.searcher().num_docs == 0
|
||||
# Manual is the default setting.
|
||||
# In this case, changes are reflected only when
|
||||
# the index is manually reloaded.
|
||||
idx.reload()
|
||||
assert idx.searcher().num_docs == 1
|
||||
idx.config_reader("OnCommit", 4)
|
||||
writer.add_document(Document(title="mytitle2", body="mybody2"))
|
||||
writer.commit()
|
||||
import time
|
||||
for i in range(50):
|
||||
# The index should be automatically reloaded.
|
||||
# Wait for at most 5s for it to happen.
|
||||
time.sleep(0.1)
|
||||
if idx.searcher().num_docs == 2:
|
||||
return
|
||||
assert False
|
||||
|
||||
def test_create_writer_and_reader(self, disk_index):
|
||||
_, path_to_index, schema, default_args, title, body = disk_index
|
||||
idx = tantivy.Index(schema, path_to_index)
|
||||
writer = idx.writer()
|
||||
reload_policy = "OnCommit" # or "Manual"
|
||||
reader = idx.reader(reload_policy, 4)
|
||||
|
||||
# check against the opstamp in the meta file
|
||||
meta_fname = "meta.json"
|
||||
with open("{}{}".format(path_to_index, meta_fname)) as f:
|
||||
json_file = json.load(f)
|
||||
expected_last_opstamp = json_file["opstamp"]
|
||||
# ASSUMPTION
|
||||
# We haven't had any deletes in the index
|
||||
# so max_doc per index coincides with the value of `num_docs`
|
||||
# summing them in all segments, gives the number of documents
|
||||
expected_num_docs = sum([segment["max_doc"]
|
||||
for segment in json_file["segments"]])
|
||||
assert writer.commit_opstamp == expected_last_opstamp
|
||||
class TestSearcher(object):
|
||||
def test_searcher_repr(self, ram_index):
|
||||
assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)"
|
||||
|
||||
q_parser = tantivy.QueryParser.for_index(idx, default_args)
|
||||
# get all documents
|
||||
query = q_parser.parse_query("*")
|
||||
top_docs = tantivy.TopDocs(10)
|
||||
|
||||
docs = reader.searcher().search(query, top_docs)
|
||||
for (_score, doc_addr) in docs:
|
||||
print(reader.searcher().doc(doc_addr))
|
||||
assert expected_num_docs == len(docs)
|
||||
class TestDocument(object):
|
||||
|
||||
def test_document(self):
|
||||
doc = tantivy.Document(name="Bill", reference=[1, 2])
|
||||
assert doc["reference"] == [1, 2]
|
||||
assert doc["name"] == ["Bill"]
|
||||
assert doc.get_first("name") == "Bill"
|
||||
assert doc.get_first("reference") == 1
|
||||
assert doc.to_dict() == {"name": ["Bill"], "reference": [1, 2]}
|
||||
|
||||
def test_document_with_date(self):
|
||||
import datetime
|
||||
date = datetime.datetime(2019, 8, 12, 13, 0, 0, )
|
||||
doc = tantivy.Document(name="Bill", date=date)
|
||||
assert doc["date"][0] == date
|
||||
|
||||
def test_document_repr(self):
|
||||
doc = tantivy.Document(name="Bill", reference=[1, 2])
|
||||
assert repr(doc) == "Document(name=[Bill],reference=[1,2])"
|
||||
|
||||
def test_document_with_facet(self):
|
||||
doc = tantivy.Document()
|
||||
facet = tantivy.Facet.from_string("/europe/france")
|
||||
doc.add_facet("facet", facet)
|
||||
assert doc["facet"][0].to_path() == ['europe', 'france']
|
||||
doc = tantivy.Document()
|
||||
facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
|
||||
doc.add_facet("facet", facet)
|
||||
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
|
||||
assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
|
||||
assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"
|
||||
doc = tantivy.Document(facet=facet)
|
||||
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
|
||||
|
||||
def test_document_error(self):
|
||||
with pytest.raises(ValueError):
|
||||
tantivy.Document(name={})
|
||||
|
|
Loading…
Reference in New Issue