commit
f57c4669c8
|
@ -11,7 +11,8 @@ crate-type = ["cdylib"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
chrono = "0.4"
|
chrono = "0.4"
|
||||||
tantivy = { git = "https://github.com/tantivy-search/tantivy" }
|
tantivy = { git = "https://github.com/tantivy-search/tantivy", branch = "master" }
|
||||||
|
itertools = "0.8"
|
||||||
|
|
||||||
[dependencies.pyo3]
|
[dependencies.pyo3]
|
||||||
version = "0.7.0"
|
version = "0.7.0"
|
||||||
|
|
50
README.md
50
README.md
|
@ -24,35 +24,37 @@ needs to be built. After that documents can be added to the index and a reader
|
||||||
can be created to search the index.
|
can be created to search the index.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
builder = tantivy.SchemaBuilder()
|
import tantivy
|
||||||
|
|
||||||
title = builder.add_text_field("title", stored=True)
|
# Declaring our schema.
|
||||||
body = builder.add_text_field("body")
|
schema_builder = tantivy.SchemaBuilder()
|
||||||
|
schema_builder.add_text_field("title", stored=True)
|
||||||
|
schema_builder.add_text_field("body", stored=True)
|
||||||
|
schema = schema_builder.build()
|
||||||
|
|
||||||
schema = builder.build()
|
# Creating our index (in memory, but filesystem is available too)
|
||||||
index = tantivy.Index(schema)
|
index = tantivy.Index(schema)
|
||||||
|
|
||||||
writer = index.writer()
|
|
||||||
|
|
||||||
doc = tantivy.Document()
|
# Adding one document.
|
||||||
doc.add_text(title, "The Old Man and the Sea")
|
writer = index.writer()
|
||||||
doc.add_text(body, ("He was an old man who fished alone in a skiff in"
|
writer.add_document({
|
||||||
"the Gulf Stream and he had gone eighty-four days "
|
"title": "The Old Man and the Sea",
|
||||||
"now without taking a fish."))
|
"body": """He was an old man who fished alone in a skiff in
|
||||||
writer.add_document(doc)
|
the Gulf Stream and he had gone eighty-four days
|
||||||
writer.commit()
|
now without taking a fish."""
|
||||||
|
})
|
||||||
|
# ... and committing
|
||||||
|
writer.commit()
|
||||||
|
|
||||||
reader = index.reader()
|
|
||||||
searcher = reader.searcher()
|
|
||||||
|
|
||||||
query_parser = tantivy.QueryParser.for_index(index, [title, body])
|
# Reload the index to ensure it points to the last commit.
|
||||||
query = query_parser.parse_query("sea whale")
|
index.reload();
|
||||||
|
searcher = index.searcher()
|
||||||
|
query = index.parse_query("sea whale", ["title", "body"])
|
||||||
|
top_docs = tantivy.TopDocs(3)
|
||||||
|
|
||||||
top_docs = tantivy.TopDocs(10)
|
(best_score, best_doc_address) = searcher.search(query, nhits=3)[0]
|
||||||
result = searcher.search(query, top_docs)
|
best_doc = searcher.doc(best_doc_address)
|
||||||
|
assert best_doc["title"] == ["The Old Man and the Sea"]
|
||||||
_, doc_address = result[0]
|
|
||||||
|
|
||||||
searched_doc = searcher.doc(doc_address)
|
|
||||||
assert searched_doc.get_first(title) == "The Old Man and the Sea"
|
|
||||||
```
|
```
|
||||||
|
|
307
src/document.rs
307
src/document.rs
|
@ -1,75 +1,255 @@
|
||||||
#![allow(clippy::new_ret_no_self)]
|
#![allow(clippy::new_ret_no_self)]
|
||||||
|
#![allow(clippy::wrong_self_convention)]
|
||||||
|
|
||||||
|
use itertools::Itertools;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::PyDateTime;
|
use pyo3::types::{PyAny, PyDateTime, PyDict, PyList, PyTuple};
|
||||||
use pyo3::types::{PyDateAccess, PyTimeAccess};
|
use pyo3::types::{PyDateAccess, PyTimeAccess};
|
||||||
|
|
||||||
use chrono::offset::TimeZone;
|
use chrono::offset::TimeZone;
|
||||||
use chrono::Utc;
|
use chrono::{Datelike, Timelike, Utc};
|
||||||
|
|
||||||
use tantivy as tv;
|
use tantivy as tv;
|
||||||
|
|
||||||
use crate::facet::Facet;
|
use crate::facet::Facet;
|
||||||
use crate::field::{Field, FieldValue};
|
use crate::to_pyerr;
|
||||||
|
use pyo3::{PyMappingProtocol, PyObjectProtocol};
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::fmt;
|
||||||
|
use tantivy::schema::Value;
|
||||||
|
|
||||||
|
fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
|
||||||
|
Ok(match value {
|
||||||
|
Value::Str(text) => text.into_object(py),
|
||||||
|
Value::U64(num) => num.into_object(py),
|
||||||
|
Value::I64(num) => num.into_object(py),
|
||||||
|
Value::F64(num) => num.into_object(py),
|
||||||
|
Value::Bytes(b) => b.to_object(py),
|
||||||
|
Value::Date(d) => PyDateTime::new(
|
||||||
|
py,
|
||||||
|
d.year(),
|
||||||
|
d.month() as u8,
|
||||||
|
d.day() as u8,
|
||||||
|
d.hour() as u8,
|
||||||
|
d.minute() as u8,
|
||||||
|
d.second() as u8,
|
||||||
|
d.timestamp_subsec_micros(),
|
||||||
|
None,
|
||||||
|
)?
|
||||||
|
.into_object(py),
|
||||||
|
Value::Facet(f) => Facet { inner: f.clone() }.into_object(py),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn value_to_string(value: &Value) -> String {
|
||||||
|
match value {
|
||||||
|
Value::Str(text) => text.clone(),
|
||||||
|
Value::U64(num) => format!("{}", num),
|
||||||
|
Value::I64(num) => format!("{}", num),
|
||||||
|
Value::F64(num) => format!("{}", num),
|
||||||
|
Value::Bytes(bytes) => format!("{:?}", bytes),
|
||||||
|
Value::Date(d) => format!("{:?}", d),
|
||||||
|
Value::Facet(facet) => facet.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Tantivy's Document is the object that can be indexed and then searched for.
|
/// Tantivy's Document is the object that can be indexed and then searched for.
|
||||||
///
|
///
|
||||||
/// Documents are fundamentally a collection of unordered tuples
|
/// Documents are fundamentally a collection of unordered tuples
|
||||||
/// (field, value). In this list, one field may appear more than once.
|
/// (field_name, value). In this list, one field may appear more than once.
|
||||||
///
|
///
|
||||||
/// Example:
|
/// Example:
|
||||||
/// >>> doc = tantivy.Document()
|
/// >>> doc = tantivy.Document()
|
||||||
/// >>> doc.add_text(title, "The Old Man and the Sea")
|
/// >>> doc.add_text("title", "The Old Man and the Sea")
|
||||||
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
|
/// >>> doc.add_text("body", ("He was an old man who fished alone in a "
|
||||||
/// "skiff in the Gulf Stream and he had gone "
|
/// "skiff in the Gulf Stream and he had gone "
|
||||||
/// "eighty-four days now without taking a fish."))
|
/// "eighty-four days now without taking a fish."))
|
||||||
|
///
|
||||||
|
/// For simplicity, it is also possible to build a `Document` by passing the field
|
||||||
|
/// values directly as constructor arguments.
|
||||||
|
///
|
||||||
|
/// Example:
|
||||||
|
/// >>> doc = tantivy.Document(title=["The Old Man and the Sea"], body=["..."])
|
||||||
|
///
|
||||||
|
/// As syntactic sugar, tantivy also allows the user to pass a single value
|
||||||
|
/// if there is only one. In other words, the following is also legal.
|
||||||
|
///
|
||||||
|
/// Example:
|
||||||
|
/// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...")
|
||||||
|
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
|
#[derive(Default)]
|
||||||
pub(crate) struct Document {
|
pub(crate) struct Document {
|
||||||
pub(crate) inner: tv::Document,
|
pub(crate) field_values: BTreeMap<String, Vec<tv::schema::Value>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for Document {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
let doc_str = self
|
||||||
|
.field_values
|
||||||
|
.iter()
|
||||||
|
.map(|(field_name, field_values)| {
|
||||||
|
let mut values_str =
|
||||||
|
field_values.iter().map(value_to_string).join(",");
|
||||||
|
values_str.truncate(10);
|
||||||
|
format!("{}=[{}]", field_name, values_str)
|
||||||
|
})
|
||||||
|
.join(",");
|
||||||
|
write!(f, "Document({})", doc_str)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_value<T>(doc: &mut Document, field_name: String, value: T)
|
||||||
|
where
|
||||||
|
Value: From<T>,
|
||||||
|
{
|
||||||
|
doc.field_values
|
||||||
|
.entry(field_name)
|
||||||
|
.or_insert_with(Vec::new)
|
||||||
|
.push(Value::from(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_value(any: &PyAny) -> PyResult<Value> {
|
||||||
|
if let Ok(s) = any.extract::<String>() {
|
||||||
|
return Ok(Value::Str(s));
|
||||||
|
}
|
||||||
|
if let Ok(num) = any.extract::<i64>() {
|
||||||
|
return Ok(Value::I64(num));
|
||||||
|
}
|
||||||
|
if let Ok(num) = any.extract::<f64>() {
|
||||||
|
return Ok(Value::F64(num));
|
||||||
|
}
|
||||||
|
if let Ok(py_datetime) = any.downcast_ref::<PyDateTime>() {
|
||||||
|
let datetime = Utc
|
||||||
|
.ymd(
|
||||||
|
py_datetime.get_year(),
|
||||||
|
py_datetime.get_month().into(),
|
||||||
|
py_datetime.get_day().into(),
|
||||||
|
)
|
||||||
|
.and_hms_micro(
|
||||||
|
py_datetime.get_hour().into(),
|
||||||
|
py_datetime.get_minute().into(),
|
||||||
|
py_datetime.get_second().into(),
|
||||||
|
py_datetime.get_microsecond(),
|
||||||
|
);
|
||||||
|
return Ok(Value::Date(datetime));
|
||||||
|
}
|
||||||
|
if let Ok(facet) = any.downcast_ref::<Facet>() {
|
||||||
|
return Ok(Value::Facet(facet.inner.clone()));
|
||||||
|
}
|
||||||
|
Err(to_pyerr(format!("Value unsupported {:?}", any)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
|
||||||
|
if let Ok(values) = any.downcast_ref::<PyList>() {
|
||||||
|
values.iter().map(extract_value).collect()
|
||||||
|
} else {
|
||||||
|
Ok(vec![extract_value(any)?])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Document {
|
impl Document {
|
||||||
#[new]
|
#[new]
|
||||||
fn new(obj: &PyRawObject) {
|
#[args(kwargs = "**")]
|
||||||
obj.init(Document {
|
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||||
inner: tv::Document::default(),
|
let mut document = Document::default();
|
||||||
});
|
if let Some(field_dict) = kwargs {
|
||||||
|
document.extend(field_dict)?;
|
||||||
|
}
|
||||||
|
obj.init(document);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> {
|
||||||
|
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
|
||||||
|
BTreeMap::new();
|
||||||
|
for key_value_any in py_dict.items() {
|
||||||
|
if let Ok(key_value) = key_value_any.downcast_ref::<PyTuple>() {
|
||||||
|
if key_value.len() != 2 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let key: String = key_value.get_item(0).extract()?;
|
||||||
|
let value_list =
|
||||||
|
extract_value_single_or_list(key_value.get_item(1))?;
|
||||||
|
field_values.insert(key, value_list);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.field_values.extend(field_values.into_iter());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[staticmethod]
|
||||||
|
fn from_dict(py_dict: &PyDict) -> PyResult<Document> {
|
||||||
|
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
|
||||||
|
BTreeMap::new();
|
||||||
|
for key_value_any in py_dict.items() {
|
||||||
|
if let Ok(key_value) = key_value_any.downcast_ref::<PyTuple>() {
|
||||||
|
if key_value.len() != 2 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let key: String = key_value.get_item(0).extract()?;
|
||||||
|
let value_list =
|
||||||
|
extract_value_single_or_list(key_value.get_item(1))?;
|
||||||
|
field_values.insert(key, value_list);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Document { field_values })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a dictionary with the different
|
||||||
|
/// field values.
|
||||||
|
///
|
||||||
|
/// In tantivy, a `Document` can hold multiple
|
||||||
|
/// values for a single field.
|
||||||
|
///
|
||||||
|
/// For this reason, the dictionary will associate
|
||||||
|
/// a list of values with every field.
|
||||||
|
fn to_dict(&self, py: Python) -> PyResult<PyObject> {
|
||||||
|
let dict = PyDict::new(py);
|
||||||
|
for (key, values) in &self.field_values {
|
||||||
|
let values_py: Vec<PyObject> = values
|
||||||
|
.iter()
|
||||||
|
.map(|v| value_to_py(py, v))
|
||||||
|
.collect::<PyResult<_>>()?;
|
||||||
|
dict.set_item(key, values_py)?;
|
||||||
|
}
|
||||||
|
Ok(dict.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a text value to the document.
|
/// Add a text value to the document.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the text.
|
/// field_name (str): The field name for which we are adding the text.
|
||||||
/// text (str): The text that will be added to the document.
|
/// text (str): The text that will be added to the document.
|
||||||
fn add_text(&mut self, field: &Field, text: &str) {
|
fn add_text(&mut self, field_name: String, text: &str) {
|
||||||
self.inner.add_text(field.inner, text);
|
add_value(self, field_name, text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add an unsigned integer value to the document.
|
/// Add an unsigned integer value to the document.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the integer.
|
/// field_name (str): The field name for which we are adding the unsigned integer.
|
||||||
/// value (int): The integer that will be added to the document.
|
/// value (int): The integer that will be added to the document.
|
||||||
fn add_unsigned(&mut self, field: &Field, value: u64) {
|
fn add_unsigned(&mut self, field_name: String, value: u64) {
|
||||||
self.inner.add_u64(field.inner, value);
|
add_value(self, field_name, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a signed integer value to the document.
|
/// Add a signed integer value to the document.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the integer.
|
/// field_name (str): The field name for which we are adding the integer.
|
||||||
/// value (int): The integer that will be added to the document.
|
/// value (int): The integer that will be added to the document.
|
||||||
fn add_integer(&mut self, field: &Field, value: i64) {
|
fn add_integer(&mut self, field_name: String, value: i64) {
|
||||||
self.inner.add_i64(field.inner, value);
|
add_value(self, field_name, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a date value to the document.
|
/// Add a date value to the document.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the integer.
|
/// field_name (str): The field name for which we are adding the date.
|
||||||
/// value (datetime): The date that will be added to the document.
|
/// value (datetime): The date that will be added to the document.
|
||||||
fn add_date(&mut self, field: &Field, value: &PyDateTime) {
|
fn add_date(&mut self, field_name: String, value: &PyDateTime) {
|
||||||
let datetime = Utc
|
let datetime = Utc
|
||||||
.ymd(
|
.ymd(
|
||||||
value.get_year(),
|
value.get_year(),
|
||||||
|
@ -82,37 +262,36 @@ impl Document {
|
||||||
value.get_second().into(),
|
value.get_second().into(),
|
||||||
value.get_microsecond(),
|
value.get_microsecond(),
|
||||||
);
|
);
|
||||||
|
add_value(self, field_name, datetime);
|
||||||
self.inner.add_date(field.inner, &datetime);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a facet value to the document.
|
/// Add a facet value to the document.
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the facet.
|
/// field_name (str): The field name for which we are adding the facet.
|
||||||
/// value (Facet): The Facet that will be added to the document.
|
/// value (Facet): The Facet that will be added to the document.
|
||||||
fn add_facet(&mut self, field: &Field, value: &Facet) {
|
fn add_facet(&mut self, field_name: String, facet: &Facet) {
|
||||||
self.inner.add_facet(field.inner, value.inner.clone());
|
add_value(self, field_name, facet.inner.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a bytes value to the document.
|
/// Add a bytes value to the document.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// field (Field): The field for which we are adding the bytes.
|
/// field_name (str): The field name for which we are adding the bytes.
|
||||||
/// value (bytes): The bytes that will be added to the document.
|
/// value (bytes): The bytes that will be added to the document.
|
||||||
fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
|
fn add_bytes(&mut self, field_name: String, bytes: Vec<u8>) {
|
||||||
self.inner.add_bytes(field.inner, value);
|
add_value(self, field_name, bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the number of added fields that have been added to the document
|
/// Returns the number of fields that have been added to the document
|
||||||
#[getter]
|
#[getter]
|
||||||
fn len(&self) -> usize {
|
fn num_fields(&self) -> usize {
|
||||||
self.inner.len()
|
self.field_values.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// True if the document is empty, False otherwise.
|
/// True if the document is empty, False otherwise.
|
||||||
#[getter]
|
#[getter]
|
||||||
fn is_empty(&self) -> bool {
|
fn is_empty(&self) -> bool {
|
||||||
self.inner.is_empty()
|
self.field_values.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the first value associated with the given field.
|
/// Get the first value associated with the given field.
|
||||||
|
@ -122,9 +301,17 @@ impl Document {
|
||||||
///
|
///
|
||||||
/// Returns the value if one is found, otherwise None.
|
/// Returns the value if one is found, otherwise None.
|
||||||
/// The type of the value depends on the field.
|
/// The type of the value depends on the field.
|
||||||
fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
|
fn get_first(
|
||||||
let value = self.inner.get_first(field.inner)?;
|
&self,
|
||||||
FieldValue::value_to_py(py, value)
|
py: Python,
|
||||||
|
fieldname: &str,
|
||||||
|
) -> PyResult<Option<PyObject>> {
|
||||||
|
if let Some(value) = self.iter_values_for_field(fieldname).next() {
|
||||||
|
let py_value = value_to_py(py, value)?;
|
||||||
|
Ok(Some(py_value))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get all the values associated with the given field.
|
/// Get all the values associated with the given field.
|
||||||
|
@ -134,21 +321,37 @@ impl Document {
|
||||||
///
|
///
|
||||||
/// Returns a list of values.
|
/// Returns a list of values.
|
||||||
/// The type of the value depends on the field.
|
/// The type of the value depends on the field.
|
||||||
fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
|
fn get_all(&self, py: Python, field_name: &str) -> PyResult<Vec<PyObject>> {
|
||||||
let values = self.inner.get_all(field.inner);
|
self.iter_values_for_field(field_name)
|
||||||
values
|
.map(|value| value_to_py(py, value))
|
||||||
.iter()
|
.collect::<PyResult<Vec<_>>>()
|
||||||
.map(|&v| FieldValue::value_to_py(py, v))
|
}
|
||||||
.filter_map(|x| x)
|
}
|
||||||
.collect()
|
|
||||||
}
|
impl Document {
|
||||||
|
fn iter_values_for_field<'a>(
|
||||||
/// Get all the fields and values contained in the document.
|
&'a self,
|
||||||
fn field_values(&self, py: Python) -> Vec<FieldValue> {
|
field: &str,
|
||||||
let field_values = self.inner.field_values();
|
) -> impl Iterator<Item = &'a Value> + 'a {
|
||||||
field_values
|
self.field_values
|
||||||
.iter()
|
.get(field)
|
||||||
.map(|v| FieldValue::field_value_to_py(py, v))
|
.into_iter()
|
||||||
.collect()
|
.flat_map(|values| values.iter())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyproto]
|
||||||
|
impl PyMappingProtocol for Document {
|
||||||
|
fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
|
||||||
|
let gil = Python::acquire_gil();
|
||||||
|
let py = gil.python();
|
||||||
|
self.get_all(py, field_name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyproto]
|
||||||
|
impl PyObjectProtocol for Document {
|
||||||
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
|
Ok(format!("{:?}", self))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
21
src/facet.rs
21
src/facet.rs
|
@ -1,6 +1,6 @@
|
||||||
|
use pyo3::basic::PyObjectProtocol;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::PyType;
|
use pyo3::types::PyType;
|
||||||
|
|
||||||
use tantivy::schema;
|
use tantivy::schema;
|
||||||
|
|
||||||
/// A Facet represents a point in a given hierarchy.
|
/// A Facet represents a point in a given hierarchy.
|
||||||
|
@ -52,4 +52,23 @@ impl Facet {
|
||||||
inner: schema::Facet::from_text(facet_string),
|
inner: schema::Facet::from_text(facet_string),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the list of `segments` that form a facet path.
|
||||||
|
///
|
||||||
|
/// For instance `//europe/france` becomes `["europe", "france"]`.
|
||||||
|
fn to_path(&self) -> Vec<&str> {
|
||||||
|
self.inner.to_path()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the facet string representation.
|
||||||
|
fn to_path_str(&self) -> String {
|
||||||
|
self.inner.to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyproto]
|
||||||
|
impl PyObjectProtocol for Facet {
|
||||||
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
|
Ok(format!("Facet({})", self.to_path_str()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
73
src/field.rs
73
src/field.rs
|
@ -1,73 +0,0 @@
|
||||||
use pyo3::prelude::*;
|
|
||||||
use pyo3::types::PyDateTime;
|
|
||||||
|
|
||||||
use tantivy::schema;
|
|
||||||
|
|
||||||
use crate::facet::Facet;
|
|
||||||
|
|
||||||
/// Field is a numeric identifier that represents an entry in the Schema.
|
|
||||||
#[pyclass]
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub(crate) struct Field {
|
|
||||||
pub(crate) inner: schema::Field,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// FieldValue holds together a Field and its Value.
|
|
||||||
#[pyclass]
|
|
||||||
pub(crate) struct FieldValue {
|
|
||||||
pub(crate) field: Field,
|
|
||||||
pub(crate) value: PyObject,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[pymethods]
|
|
||||||
impl FieldValue {
|
|
||||||
#[getter]
|
|
||||||
fn field(&self) -> Field {
|
|
||||||
self.field.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[getter]
|
|
||||||
fn value(&self) -> &PyObject {
|
|
||||||
&self.value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FieldValue {
|
|
||||||
pub(crate) fn value_to_py(
|
|
||||||
py: Python,
|
|
||||||
value: &schema::Value,
|
|
||||||
) -> Option<PyObject> {
|
|
||||||
match value {
|
|
||||||
schema::Value::Str(text) => Some(text.into_object(py)),
|
|
||||||
schema::Value::U64(num) => Some(num.into_object(py)),
|
|
||||||
schema::Value::I64(num) => Some(num.into_object(py)),
|
|
||||||
schema::Value::F64(num) => Some(num.into_object(py)),
|
|
||||||
schema::Value::Bytes(b) => Some(b.to_object(py)),
|
|
||||||
schema::Value::Date(d) => {
|
|
||||||
let date =
|
|
||||||
PyDateTime::from_timestamp(py, d.timestamp() as f64, None);
|
|
||||||
|
|
||||||
match date {
|
|
||||||
Ok(d) => Some(d.into_object(py)),
|
|
||||||
Err(_e) => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
schema::Value::Facet(f) => {
|
|
||||||
Some(Facet { inner: f.clone() }.into_object(py))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn field_value_to_py(
|
|
||||||
py: Python,
|
|
||||||
field_value: &schema::FieldValue,
|
|
||||||
) -> FieldValue {
|
|
||||||
let value = field_value.value();
|
|
||||||
let field = field_value.field();
|
|
||||||
|
|
||||||
FieldValue {
|
|
||||||
field: Field { inner: field },
|
|
||||||
value: FieldValue::value_to_py(py, value).unwrap(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
253
src/index.rs
253
src/index.rs
|
@ -4,60 +4,24 @@ use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
use crate::document::Document;
|
use crate::document::Document;
|
||||||
|
use crate::query::Query;
|
||||||
use crate::schema::Schema;
|
use crate::schema::Schema;
|
||||||
use crate::searcher::Searcher;
|
use crate::searcher::Searcher;
|
||||||
|
use crate::to_pyerr;
|
||||||
use tantivy as tv;
|
use tantivy as tv;
|
||||||
use tantivy::directory::MmapDirectory;
|
use tantivy::directory::MmapDirectory;
|
||||||
|
use tantivy::schema::{Field, NamedFieldDocument};
|
||||||
|
|
||||||
const RELOAD_POLICY: &str = "commit";
|
const RELOAD_POLICY: &str = "commit";
|
||||||
|
|
||||||
/// IndexReader is the entry point to read and search the index.
|
|
||||||
///
|
|
||||||
/// IndexReader controls when a new version of the index should be loaded and
|
|
||||||
/// lends you instances of Searcher for the last loaded version.
|
|
||||||
///
|
|
||||||
/// To create an IndexReader first create an Index and call the reader() method
|
|
||||||
/// on the index object.
|
|
||||||
#[pyclass]
|
|
||||||
pub(crate) struct IndexReader {
|
|
||||||
inner: tv::IndexReader,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[pymethods]
|
|
||||||
impl IndexReader {
|
|
||||||
/// Update searchers so that they reflect the state of the last .commit().
|
|
||||||
///
|
|
||||||
/// If you set up the reload policy to be on 'commit' (which is the
|
|
||||||
/// default) every commit should be rapidly reflected on your IndexReader
|
|
||||||
/// and you should not need to call reload() at all.
|
|
||||||
fn reload(&self) -> PyResult<()> {
|
|
||||||
let ret = self.inner.reload();
|
|
||||||
match ret {
|
|
||||||
Ok(_) => Ok(()),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get a Searcher for the index.
|
|
||||||
///
|
|
||||||
/// This method should be called every single time a search query is
|
|
||||||
/// performed. The searchers are taken from a pool of num_searchers
|
|
||||||
/// searchers.
|
|
||||||
///
|
|
||||||
/// Returns a Searcher object, if no searcher is available this may block.
|
|
||||||
fn searcher(&self) -> Searcher {
|
|
||||||
let searcher = self.inner.searcher();
|
|
||||||
Searcher { inner: searcher }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// IndexWriter is the user entry-point to add documents to the index.
|
/// IndexWriter is the user entry-point to add documents to the index.
|
||||||
///
|
///
|
||||||
/// To create an IndexWriter first create an Index and call the writer() method
|
/// To create an IndexWriter first create an Index and call the writer() method
|
||||||
/// on the index object.
|
/// on the index object.
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub(crate) struct IndexWriter {
|
pub(crate) struct IndexWriter {
|
||||||
inner: tv::IndexWriter,
|
inner_index_writer: tv::IndexWriter,
|
||||||
|
schema: tv::schema::Schema,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
|
@ -70,9 +34,24 @@ impl IndexWriter {
|
||||||
/// by the client to align commits with its own document queue.
|
/// by the client to align commits with its own document queue.
|
||||||
/// The `opstamp` represents the number of documents that have been added
|
/// The `opstamp` represents the number of documents that have been added
|
||||||
/// since the creation of the index.
|
/// since the creation of the index.
|
||||||
fn add_document(&mut self, document: &Document) -> PyResult<()> {
|
pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
|
||||||
self.inner.add_document(document.inner.clone());
|
let named_doc = NamedFieldDocument(doc.field_values.clone());
|
||||||
Ok(())
|
let doc = self.schema.convert_named_doc(named_doc).map_err(to_pyerr)?;
|
||||||
|
Ok(self.inner_index_writer.add_document(doc))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper for the `add_document` method, but passing a json string.
|
||||||
|
///
|
||||||
|
/// If the indexing pipeline is full, this call may block.
|
||||||
|
///
|
||||||
|
/// Returns an `opstamp`, which is an increasing integer that can be used
|
||||||
|
/// by the client to align commits with its own document queue.
|
||||||
|
/// The `opstamp` represents the number of documents that have been added
|
||||||
|
/// since the creation of the index.
|
||||||
|
pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
|
||||||
|
let doc = self.schema.parse_document(json).map_err(to_pyerr)?;
|
||||||
|
let opstamp = self.inner_index_writer.add_document(doc);
|
||||||
|
Ok(opstamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Commits all of the pending changes
|
/// Commits all of the pending changes
|
||||||
|
@ -84,12 +63,8 @@ impl IndexWriter {
|
||||||
/// spared), it will be possible to resume indexing from this point.
|
/// spared), it will be possible to resume indexing from this point.
|
||||||
///
|
///
|
||||||
/// Returns the `opstamp` of the last document that made it in the commit.
|
/// Returns the `opstamp` of the last document that made it in the commit.
|
||||||
fn commit(&mut self) -> PyResult<()> {
|
fn commit(&mut self) -> PyResult<u64> {
|
||||||
let ret = self.inner.commit();
|
self.inner_index_writer.commit().map_err(to_pyerr)
|
||||||
match ret {
|
|
||||||
Ok(_) => Ok(()),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Rollback to the last commit
|
/// Rollback to the last commit
|
||||||
|
@ -97,23 +72,15 @@ impl IndexWriter {
|
||||||
/// This cancels all of the updates that happened after the last
|
/// This cancels all of the updates that happened after the last
|
||||||
/// commit. After calling rollback, the index is in the same state as it
|
/// commit. After calling rollback, the index is in the same state as it
|
||||||
/// was after the last commit.
|
/// was after the last commit.
|
||||||
fn rollback(&mut self) -> PyResult<()> {
|
fn rollback(&mut self) -> PyResult<u64> {
|
||||||
let ret = self.inner.rollback();
|
self.inner_index_writer.rollback().map_err(to_pyerr)
|
||||||
|
|
||||||
match ret {
|
|
||||||
Ok(_) => Ok(()),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Detects and removes the files that are not used by the index anymore.
|
/// Detects and removes the files that are not used by the index anymore.
|
||||||
fn garbage_collect_files(&mut self) -> PyResult<()> {
|
fn garbage_collect_files(&mut self) -> PyResult<()> {
|
||||||
let ret = self.inner.garbage_collect_files();
|
self.inner_index_writer
|
||||||
|
.garbage_collect_files()
|
||||||
match ret {
|
.map_err(to_pyerr)
|
||||||
Ok(_) => Ok(()),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The opstamp of the last successful commit.
|
/// The opstamp of the last successful commit.
|
||||||
|
@ -125,7 +92,7 @@ impl IndexWriter {
|
||||||
/// for searchers.
|
/// for searchers.
|
||||||
#[getter]
|
#[getter]
|
||||||
fn commit_opstamp(&self) -> u64 {
|
fn commit_opstamp(&self) -> u64 {
|
||||||
self.inner.commit_opstamp()
|
self.inner_index_writer.commit_opstamp()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -142,11 +109,19 @@ impl IndexWriter {
|
||||||
/// if there was a problem during the opening or creation of the index.
|
/// if there was a problem during the opening or creation of the index.
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub(crate) struct Index {
|
pub(crate) struct Index {
|
||||||
pub(crate) inner: tv::Index,
|
pub(crate) index: tv::Index,
|
||||||
|
reader: tv::IndexReader,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Index {
|
impl Index {
|
||||||
|
#[staticmethod]
|
||||||
|
fn open(path: &str) -> PyResult<Index> {
|
||||||
|
let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
|
||||||
|
let reader = index.reader().map_err(to_pyerr)?;
|
||||||
|
Ok(Index { index, reader })
|
||||||
|
}
|
||||||
|
|
||||||
#[new]
|
#[new]
|
||||||
#[args(reuse = true)]
|
#[args(reuse = true)]
|
||||||
fn new(
|
fn new(
|
||||||
|
@ -157,32 +132,19 @@ impl Index {
|
||||||
) -> PyResult<()> {
|
) -> PyResult<()> {
|
||||||
let index = match path {
|
let index = match path {
|
||||||
Some(p) => {
|
Some(p) => {
|
||||||
let directory = MmapDirectory::open(p);
|
let directory = MmapDirectory::open(p).map_err(to_pyerr)?;
|
||||||
|
if reuse {
|
||||||
let dir = match directory {
|
tv::Index::open_or_create(directory, schema.inner.clone())
|
||||||
Ok(d) => d,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(exceptions::OSError::py_err(e.to_string()))
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let i = if reuse {
|
|
||||||
tv::Index::open_or_create(dir, schema.inner.clone())
|
|
||||||
} else {
|
} else {
|
||||||
tv::Index::create(dir, schema.inner.clone())
|
tv::Index::create(directory, schema.inner.clone())
|
||||||
};
|
|
||||||
|
|
||||||
match i {
|
|
||||||
Ok(index) => index,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(exceptions::OSError::py_err(e.to_string()))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
.map_err(to_pyerr)?
|
||||||
}
|
}
|
||||||
None => tv::Index::create_in_ram(schema.inner.clone()),
|
None => tv::Index::create_in_ram(schema.inner.clone()),
|
||||||
};
|
};
|
||||||
|
|
||||||
obj.init(Index { inner: index });
|
let reader = index.reader().map_err(to_pyerr)?;
|
||||||
|
obj.init(Index { index, reader });
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -206,32 +168,30 @@ impl Index {
|
||||||
num_threads: usize,
|
num_threads: usize,
|
||||||
) -> PyResult<IndexWriter> {
|
) -> PyResult<IndexWriter> {
|
||||||
let writer = match num_threads {
|
let writer = match num_threads {
|
||||||
0 => self.inner.writer(heap_size),
|
0 => self.index.writer(heap_size),
|
||||||
_ => self.inner.writer_with_num_threads(num_threads, heap_size),
|
_ => self.index.writer_with_num_threads(num_threads, heap_size),
|
||||||
};
|
|
||||||
|
|
||||||
match writer {
|
|
||||||
Ok(w) => Ok(IndexWriter { inner: w }),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
}
|
||||||
|
.map_err(to_pyerr)?;
|
||||||
|
let schema = self.index.schema();
|
||||||
|
Ok(IndexWriter {
|
||||||
|
inner_index_writer: writer,
|
||||||
|
schema,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create an IndexReader for the index.
|
/// Configure the index reader.
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// reload_policy (str, optional): The reload policy that the
|
/// reload_policy (str, optional): The reload policy that the
|
||||||
/// IndexReader should use. Can be manual or OnCommit.
|
/// IndexReader should use. Can be `Manual` or `OnCommit`.
|
||||||
/// num_searchers (int, optional): The number of searchers that the
|
/// num_searchers (int, optional): The number of searchers that the
|
||||||
/// reader should create.
|
/// reader should create.
|
||||||
///
|
|
||||||
/// Returns the IndexReader on success, raises ValueError if a IndexReader
|
|
||||||
/// couldn't be created.
|
|
||||||
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
|
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
|
||||||
fn reader(
|
fn config_reader(
|
||||||
&self,
|
&mut self,
|
||||||
reload_policy: &str,
|
reload_policy: &str,
|
||||||
num_searchers: usize,
|
num_searchers: usize,
|
||||||
) -> PyResult<IndexReader> {
|
) -> Result<(), PyErr> {
|
||||||
let reload_policy = reload_policy.to_lowercase();
|
let reload_policy = reload_policy.to_lowercase();
|
||||||
let reload_policy = match reload_policy.as_ref() {
|
let reload_policy = match reload_policy.as_ref() {
|
||||||
"commit" => tv::ReloadPolicy::OnCommit,
|
"commit" => tv::ReloadPolicy::OnCommit,
|
||||||
|
@ -242,9 +202,7 @@ impl Index {
|
||||||
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
|
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
|
||||||
))
|
))
|
||||||
};
|
};
|
||||||
|
let builder = self.index.reader_builder();
|
||||||
let builder = self.inner.reader_builder();
|
|
||||||
|
|
||||||
let builder = builder.reload_policy(reload_policy);
|
let builder = builder.reload_policy(reload_policy);
|
||||||
let builder = if num_searchers > 0 {
|
let builder = if num_searchers > 0 {
|
||||||
builder.num_searchers(num_searchers)
|
builder.num_searchers(num_searchers)
|
||||||
|
@ -252,10 +210,23 @@ impl Index {
|
||||||
builder
|
builder
|
||||||
};
|
};
|
||||||
|
|
||||||
let reader = builder.try_into();
|
self.reader = builder.try_into().map_err(to_pyerr)?;
|
||||||
match reader {
|
Ok(())
|
||||||
Ok(r) => Ok(IndexReader { inner: r }),
|
}
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
|
/// Acquires a Searcher from the searcher pool.
|
||||||
|
///
|
||||||
|
/// If no searcher is available during the call, note that
|
||||||
|
/// this call will block until one is made available.
|
||||||
|
///
|
||||||
|
/// Searchers are automatically released back into the pool when
|
||||||
|
/// they are dropped. If you observe this function to block forever
|
||||||
|
/// you probably should configure the Index to have a larger
|
||||||
|
/// searcher pool, or you are holding references to previous searchers
|
||||||
|
/// forever.
|
||||||
|
fn searcher(&self) -> Searcher {
|
||||||
|
Searcher {
|
||||||
|
inner: self.reader.searcher(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -268,19 +239,71 @@ impl Index {
|
||||||
/// Raises OSError if the directory cannot be opened.
|
/// Raises OSError if the directory cannot be opened.
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
fn exists(path: &str) -> PyResult<bool> {
|
fn exists(path: &str) -> PyResult<bool> {
|
||||||
let directory = MmapDirectory::open(path);
|
let directory = MmapDirectory::open(path).map_err(to_pyerr)?;
|
||||||
let dir = match directory {
|
Ok(tv::Index::exists(&directory))
|
||||||
Ok(d) => d,
|
|
||||||
Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(tv::Index::exists(&dir))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The schema of the current index.
|
/// The schema of the current index.
|
||||||
#[getter]
|
#[getter]
|
||||||
fn schema(&self) -> Schema {
|
fn schema(&self) -> Schema {
|
||||||
let schema = self.inner.schema();
|
let schema = self.index.schema();
|
||||||
Schema { inner: schema }
|
Schema { inner: schema }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Update searchers so that they reflect the state of the last .commit().
|
||||||
|
///
|
||||||
|
/// If you set up the reload policy to be on 'commit' (which is the
|
||||||
|
/// default) every commit should be rapidly reflected on your IndexReader
|
||||||
|
/// and you should not need to call reload() at all.
|
||||||
|
fn reload(&self) -> PyResult<()> {
|
||||||
|
self.reader.reload().map_err(to_pyerr)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a query
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// query: the query, following the tantivy query language.
|
||||||
|
/// default_field_names (List[str], optional): Names of the fields used to search if no
|
||||||
|
/// field is specified in the query.
|
||||||
|
///
|
||||||
|
#[args(reload_policy = "RELOAD_POLICY")]
|
||||||
|
pub fn parse_query(
|
||||||
|
&self,
|
||||||
|
query: &str,
|
||||||
|
default_field_names: Option<Vec<String>>,
|
||||||
|
) -> PyResult<Query> {
|
||||||
|
let mut default_fields = vec![];
|
||||||
|
let schema = self.index.schema();
|
||||||
|
if let Some(default_field_names_vec) = default_field_names {
|
||||||
|
for default_field_name in &default_field_names_vec {
|
||||||
|
if let Some(field) = schema.get_field(default_field_name) {
|
||||||
|
let field_entry = schema.get_field_entry(field);
|
||||||
|
if !field_entry.is_indexed() {
|
||||||
|
return Err(exceptions::ValueError::py_err(format!(
|
||||||
|
"Field `{}` is not set as indexed in the schema.",
|
||||||
|
default_field_name
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
default_fields.push(field);
|
||||||
|
} else {
|
||||||
|
return Err(exceptions::ValueError::py_err(format!(
|
||||||
|
"Field `{}` is not defined in the schema.",
|
||||||
|
default_field_name
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (field_id, field_entry) in
|
||||||
|
self.index.schema().fields().iter().enumerate()
|
||||||
|
{
|
||||||
|
if field_entry.is_indexed() {
|
||||||
|
default_fields.push(Field(field_id as u32));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let parser =
|
||||||
|
tv::query::QueryParser::for_index(&self.index, default_fields);
|
||||||
|
let query = parser.parse_query(query).map_err(to_pyerr)?;
|
||||||
|
Ok(Query { inner: query })
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
14
src/lib.rs
14
src/lib.rs
|
@ -1,8 +1,8 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
mod document;
|
mod document;
|
||||||
mod facet;
|
mod facet;
|
||||||
mod field;
|
|
||||||
mod index;
|
mod index;
|
||||||
mod query;
|
mod query;
|
||||||
mod schema;
|
mod schema;
|
||||||
|
@ -11,9 +11,7 @@ mod searcher;
|
||||||
|
|
||||||
use document::Document;
|
use document::Document;
|
||||||
use facet::Facet;
|
use facet::Facet;
|
||||||
use field::{Field, FieldValue};
|
|
||||||
use index::Index;
|
use index::Index;
|
||||||
use query::QueryParser;
|
|
||||||
use schema::Schema;
|
use schema::Schema;
|
||||||
use schemabuilder::SchemaBuilder;
|
use schemabuilder::SchemaBuilder;
|
||||||
use searcher::{DocAddress, Searcher, TopDocs};
|
use searcher::{DocAddress, Searcher, TopDocs};
|
||||||
|
@ -75,14 +73,14 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||||
m.add_class::<Schema>()?;
|
m.add_class::<Schema>()?;
|
||||||
m.add_class::<SchemaBuilder>()?;
|
m.add_class::<SchemaBuilder>()?;
|
||||||
m.add_class::<Searcher>()?;
|
m.add_class::<Searcher>()?;
|
||||||
m.add_class::<Index>()?;
|
|
||||||
m.add_class::<QueryParser>()?;
|
|
||||||
m.add_class::<Document>()?;
|
m.add_class::<Document>()?;
|
||||||
|
m.add_class::<Index>()?;
|
||||||
m.add_class::<DocAddress>()?;
|
m.add_class::<DocAddress>()?;
|
||||||
m.add_class::<TopDocs>()?;
|
m.add_class::<TopDocs>()?;
|
||||||
m.add_class::<Field>()?;
|
|
||||||
m.add_class::<FieldValue>()?;
|
|
||||||
m.add_class::<Facet>()?;
|
m.add_class::<Facet>()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn to_pyerr<E: ToString>(err: E) -> PyErr {
|
||||||
|
exceptions::ValueError::py_err(err.to_string())
|
||||||
|
}
|
||||||
|
|
64
src/query.rs
64
src/query.rs
|
@ -1,70 +1,16 @@
|
||||||
use pyo3::exceptions;
|
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::PyType;
|
use pyo3::PyObjectProtocol;
|
||||||
|
|
||||||
use tantivy as tv;
|
use tantivy as tv;
|
||||||
|
|
||||||
use crate::field::Field;
|
|
||||||
use crate::index::Index;
|
|
||||||
|
|
||||||
/// Tantivy's Query
|
/// Tantivy's Query
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub(crate) struct Query {
|
pub(crate) struct Query {
|
||||||
pub(crate) inner: Box<dyn tv::query::Query>,
|
pub(crate) inner: Box<dyn tv::query::Query>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Tantivy's Query parser
|
#[pyproto]
|
||||||
#[pyclass]
|
impl PyObjectProtocol for Query {
|
||||||
pub(crate) struct QueryParser {
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
inner: tv::query::QueryParser,
|
Ok(format!("Query({:?})", self.inner))
|
||||||
}
|
|
||||||
|
|
||||||
#[pymethods]
|
|
||||||
impl QueryParser {
|
|
||||||
/// Creates a QueryParser for an Index.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// index (Index): The index for which the query will be created.
|
|
||||||
/// default_fields (List[Field]): A list of fields used to search if no
|
|
||||||
/// field is specified in the query.
|
|
||||||
///
|
|
||||||
/// Returns the QueryParser.
|
|
||||||
#[classmethod]
|
|
||||||
fn for_index(
|
|
||||||
_cls: &PyType,
|
|
||||||
index: &Index,
|
|
||||||
default_fields: Vec<&Field>,
|
|
||||||
) -> PyResult<QueryParser> {
|
|
||||||
let default_fields: Vec<tv::schema::Field> =
|
|
||||||
default_fields.iter().map(|&f| f.inner).collect();
|
|
||||||
|
|
||||||
let parser =
|
|
||||||
tv::query::QueryParser::for_index(&index.inner, default_fields);
|
|
||||||
Ok(QueryParser { inner: parser })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parse a string into a query that can be given to a searcher.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// query (str): A query string that should be parsed into a query.
|
|
||||||
///
|
|
||||||
/// Returns the parsed Query object. Raises ValueError if there was an
|
|
||||||
/// error with the query string.
|
|
||||||
fn parse_query(&self, query: &str) -> PyResult<Query> {
|
|
||||||
let ret = self.inner.parse_query(query);
|
|
||||||
|
|
||||||
match ret {
|
|
||||||
Ok(q) => Ok(Query { inner: q }),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the default way to compose queries to a conjunction.
|
|
||||||
///
|
|
||||||
/// By default, the query happy tax payer is equivalent to the query happy
|
|
||||||
/// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
|
|
||||||
/// payer will be interpreted by the parser as happy AND tax AND payer.
|
|
||||||
fn set_conjunction_by_default(&mut self) {
|
|
||||||
self.inner.set_conjunction_by_default();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,5 @@
|
||||||
use pyo3::exceptions;
|
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
|
use tantivy as tv;
|
||||||
use tantivy::schema;
|
|
||||||
|
|
||||||
use crate::document::Document;
|
|
||||||
use crate::field::Field;
|
|
||||||
|
|
||||||
/// Tantivy schema.
|
/// Tantivy schema.
|
||||||
///
|
///
|
||||||
|
@ -12,54 +7,8 @@ use crate::field::Field;
|
||||||
/// provided.
|
/// provided.
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub(crate) struct Schema {
|
pub(crate) struct Schema {
|
||||||
pub(crate) inner: schema::Schema,
|
pub(crate) inner: tv::schema::Schema,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Schema {
|
impl Schema {}
|
||||||
/// Build a document object from a json string.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// doc_json (str) - A string containing json that should be parsed
|
|
||||||
/// into a `Document`
|
|
||||||
///
|
|
||||||
/// Returns the parsed document, raises a ValueError if the parsing failed.
|
|
||||||
fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
|
|
||||||
let ret = self.inner.parse_document(doc_json);
|
|
||||||
match ret {
|
|
||||||
Ok(d) => Ok(Document { inner: d }),
|
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convert a `Document` object into a json string.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// doc (Document): The document that will be converted into a json
|
|
||||||
/// string.
|
|
||||||
fn to_json(&self, doc: &Document) -> String {
|
|
||||||
self.inner.to_json(&doc.inner)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the field name for a given `Field`.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// field (Field): The field for which the name will be returned.
|
|
||||||
fn get_field_name(&self, field: &Field) -> &str {
|
|
||||||
self.inner.get_field_name(field.inner)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the field option associated with a given name.
|
|
||||||
///
|
|
||||||
/// Args:
|
|
||||||
/// name (str): The name of the field that we want to retrieve.
|
|
||||||
///
|
|
||||||
/// Returns the Field if one is found, None otherwise.
|
|
||||||
fn get_field(&self, name: &str) -> Option<Field> {
|
|
||||||
let f = self.inner.get_field(name);
|
|
||||||
match f {
|
|
||||||
Some(field) => Some(Field { inner: field }),
|
|
||||||
None => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -5,8 +5,8 @@ use pyo3::prelude::*;
|
||||||
|
|
||||||
use tantivy::schema;
|
use tantivy::schema;
|
||||||
|
|
||||||
use crate::field::Field;
|
|
||||||
use crate::schema::Schema;
|
use crate::schema::Schema;
|
||||||
|
use std::sync::{Arc, RwLock};
|
||||||
|
|
||||||
/// Tantivy has a very strict schema.
|
/// Tantivy has a very strict schema.
|
||||||
/// You need to specify in advance whether a field is indexed or not,
|
/// You need to specify in advance whether a field is indexed or not,
|
||||||
|
@ -24,8 +24,9 @@ use crate::schema::Schema;
|
||||||
///
|
///
|
||||||
/// >>> schema = builder.build()
|
/// >>> schema = builder.build()
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
|
#[derive(Clone)]
|
||||||
pub(crate) struct SchemaBuilder {
|
pub(crate) struct SchemaBuilder {
|
||||||
pub(crate) builder: Option<schema::SchemaBuilder>,
|
pub(crate) builder: Arc<RwLock<Option<schema::SchemaBuilder>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
const TOKENIZER: &str = "default";
|
const TOKENIZER: &str = "default";
|
||||||
|
@ -36,7 +37,7 @@ impl SchemaBuilder {
|
||||||
#[new]
|
#[new]
|
||||||
fn new(obj: &PyRawObject) {
|
fn new(obj: &PyRawObject) {
|
||||||
obj.init(SchemaBuilder {
|
obj.init(SchemaBuilder {
|
||||||
builder: Some(schema::Schema::builder()),
|
builder: Arc::new(From::from(Some(schema::Schema::builder()))),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,9 +71,8 @@ impl SchemaBuilder {
|
||||||
stored: bool,
|
stored: bool,
|
||||||
tokenizer_name: &str,
|
tokenizer_name: &str,
|
||||||
index_option: &str,
|
index_option: &str,
|
||||||
) -> PyResult<Field> {
|
) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
let index_option = match index_option {
|
let index_option = match index_option {
|
||||||
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
|
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
|
||||||
"freq" => schema::IndexRecordOption::WithFreqs,
|
"freq" => schema::IndexRecordOption::WithFreqs,
|
||||||
|
@ -94,14 +94,14 @@ impl SchemaBuilder {
|
||||||
options
|
options
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_text_field(name, options);
|
builder.add_text_field(name, options);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a new signed integer field to the schema.
|
/// Add a new signed integer field to the schema.
|
||||||
|
@ -131,19 +131,19 @@ impl SchemaBuilder {
|
||||||
stored: bool,
|
stored: bool,
|
||||||
indexed: bool,
|
indexed: bool,
|
||||||
fast: Option<&str>,
|
fast: Option<&str>,
|
||||||
) -> PyResult<Field> {
|
) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_i64_field(name, opts);
|
builder.add_i64_field(name, opts);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a new unsigned integer field to the schema.
|
/// Add a new unsigned integer field to the schema.
|
||||||
|
@ -173,19 +173,19 @@ impl SchemaBuilder {
|
||||||
stored: bool,
|
stored: bool,
|
||||||
indexed: bool,
|
indexed: bool,
|
||||||
fast: Option<&str>,
|
fast: Option<&str>,
|
||||||
) -> PyResult<Field> {
|
) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_u64_field(name, opts);
|
builder.add_u64_field(name, opts);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a new date field to the schema.
|
/// Add a new date field to the schema.
|
||||||
|
@ -215,35 +215,35 @@ impl SchemaBuilder {
|
||||||
stored: bool,
|
stored: bool,
|
||||||
indexed: bool,
|
indexed: bool,
|
||||||
fast: Option<&str>,
|
fast: Option<&str>,
|
||||||
) -> PyResult<Field> {
|
) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_date_field(name, opts);
|
builder.add_date_field(name, opts);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a Facet field to the schema.
|
/// Add a Facet field to the schema.
|
||||||
/// Args:
|
/// Args:
|
||||||
/// name (str): The name of the field.
|
/// name (str): The name of the field.
|
||||||
fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
|
fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_facet_field(name);
|
builder.add_facet_field(name);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a fast bytes field to the schema.
|
/// Add a fast bytes field to the schema.
|
||||||
|
@ -254,17 +254,17 @@ impl SchemaBuilder {
|
||||||
///
|
///
|
||||||
/// Args:
|
/// Args:
|
||||||
/// name (str): The name of the field.
|
/// name (str): The name of the field.
|
||||||
fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
|
fn add_bytes_field(&mut self, name: &str) -> PyResult<Self> {
|
||||||
let builder = &mut self.builder;
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder.write().unwrap().as_mut() {
|
||||||
let field = builder.add_bytes_field(name);
|
builder.add_bytes_field(name);
|
||||||
Ok(Field { inner: field })
|
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::ValueError::py_err(
|
return Err(exceptions::ValueError::py_err(
|
||||||
"Schema builder object isn't valid anymore.",
|
"Schema builder object isn't valid anymore.",
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
|
Ok(self.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Finalize the creation of a Schema.
|
/// Finalize the creation of a Schema.
|
||||||
|
@ -272,7 +272,7 @@ impl SchemaBuilder {
|
||||||
/// Returns a Schema object. After this is called the SchemaBuilder cannot
|
/// Returns a Schema object. After this is called the SchemaBuilder cannot
|
||||||
/// be used anymore.
|
/// be used anymore.
|
||||||
fn build(&mut self) -> PyResult<Schema> {
|
fn build(&mut self) -> PyResult<Schema> {
|
||||||
let builder = self.builder.take();
|
let builder = self.builder.write().unwrap().take();
|
||||||
if let Some(builder) = builder {
|
if let Some(builder) = builder {
|
||||||
let schema = builder.build();
|
let schema = builder.build();
|
||||||
Ok(Schema { inner: schema })
|
Ok(Schema { inner: schema })
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
#![allow(clippy::new_ret_no_self)]
|
#![allow(clippy::new_ret_no_self)]
|
||||||
|
|
||||||
use pyo3::exceptions;
|
|
||||||
use pyo3::prelude::*;
|
|
||||||
|
|
||||||
use tantivy as tv;
|
|
||||||
|
|
||||||
use crate::document::Document;
|
use crate::document::Document;
|
||||||
use crate::query::Query;
|
use crate::query::Query;
|
||||||
|
use crate::to_pyerr;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3::{exceptions, PyObjectProtocol};
|
||||||
|
use tantivy as tv;
|
||||||
|
|
||||||
/// Tantivy's Searcher class
|
/// Tantivy's Searcher class
|
||||||
///
|
///
|
||||||
|
@ -60,11 +59,11 @@ impl Searcher {
|
||||||
///
|
///
|
||||||
/// Returns the Document, raises ValueError if the document can't be found.
|
/// Returns the Document, raises ValueError if the document can't be found.
|
||||||
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
|
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
|
||||||
let ret = self.inner.doc(doc_address.into());
|
let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
|
||||||
match ret {
|
let named_doc = self.inner.schema().to_named_doc(&doc);
|
||||||
Ok(doc) => Ok(Document { inner: doc }),
|
Ok(Document {
|
||||||
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
field_values: named_doc.0,
|
||||||
}
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,3 +131,14 @@ impl TopDocs {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyproto]
|
||||||
|
impl PyObjectProtocol for Searcher {
|
||||||
|
fn __repr__(&self) -> PyResult<String> {
|
||||||
|
Ok(format!(
|
||||||
|
"Searcher(num_docs={}, num_segments={})",
|
||||||
|
self.inner.num_docs(),
|
||||||
|
self.inner.segment_readers().len()
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,192 +1,188 @@
|
||||||
import json
|
|
||||||
import tantivy
|
import tantivy
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from tantivy import Document, Index, SchemaBuilder, Schema
|
||||||
|
|
||||||
|
|
||||||
|
def schema():
|
||||||
|
return SchemaBuilder() \
|
||||||
|
.add_text_field("title", stored=True) \
|
||||||
|
.add_text_field("body") \
|
||||||
|
.build()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
@pytest.fixture(scope="class")
|
||||||
def ram_index():
|
def ram_index():
|
||||||
# assume all tests will use the same documents for now
|
# assume all tests will use the same documents for now
|
||||||
# other methods may set up function-local indexes
|
# other methods may set up function-local indexes
|
||||||
builder = tantivy.SchemaBuilder()
|
index = Index(schema())
|
||||||
|
writer = index.writer()
|
||||||
|
|
||||||
title = builder.add_text_field("title", stored=True)
|
# 2 ways of adding documents
|
||||||
body = builder.add_text_field("body")
|
# 1
|
||||||
|
doc = Document()
|
||||||
schema = builder.build()
|
# create a document instance
|
||||||
index = tantivy.Index(schema)
|
# add field-value pairs
|
||||||
|
doc.add_text("title", "The Old Man and the Sea")
|
||||||
writer = index.writer()
|
doc.add_text("body", ("He was an old man who fished alone in a skiff in"
|
||||||
|
"the Gulf Stream and he had gone eighty-four days "
|
||||||
# 2 ways of adding documents
|
"now without taking a fish."))
|
||||||
# 1
|
writer.add_document(doc)
|
||||||
doc = tantivy.Document()
|
# 2 use the built-in json support
|
||||||
# create a document instance
|
# keys need to coincide with field names
|
||||||
# add field-value pairs
|
doc = Document.from_dict({
|
||||||
doc.add_text(title, "The Old Man and the Sea")
|
"title": "Of Mice and Men",
|
||||||
doc.add_text(body, ("He was an old man who fished alone in a skiff in"
|
"body": ("A few miles south of Soledad, the Salinas River drops "
|
||||||
"the Gulf Stream and he had gone eighty-four days "
|
"in close to the hillside bank and runs deep and "
|
||||||
"now without taking a fish."))
|
"green. The water is warm too, for it has slipped "
|
||||||
writer.add_document(doc)
|
"twinkling over the yellow sands in the sunlight "
|
||||||
# 2 use the built-in json support
|
"before reaching the narrow pool. On one side of the "
|
||||||
# keys need to coincide with field names
|
"river the golden foothill slopes curve up to the "
|
||||||
doc = schema.parse_document(json.dumps({
|
"strong and rocky Gabilan Mountains, but on the valley "
|
||||||
"title": "Of Mice and Men",
|
"side the water is lined with trees—willows fresh and "
|
||||||
"body": ("A few miles south of Soledad, the Salinas River drops "
|
"green with every spring, carrying in their lower leaf "
|
||||||
"in close to the hillside bank and runs deep and "
|
"junctures the debris of the winter’s flooding; and "
|
||||||
"green. The water is warm too, for it has slipped "
|
"sycamores with mottled, white, recumbent limbs and "
|
||||||
"twinkling over the yellow sands in the sunlight "
|
"branches that arch over the pool")
|
||||||
"before reaching the narrow pool. On one side of the "
|
})
|
||||||
"river the golden foothill slopes curve up to the "
|
writer.add_document(doc)
|
||||||
"strong and rocky Gabilan Mountains, but on the valley "
|
writer.add_json("""{
|
||||||
"side the water is lined with trees—willows fresh and "
|
|
||||||
"green with every spring, carrying in their lower leaf "
|
|
||||||
"junctures the debris of the winter’s flooding; and "
|
|
||||||
"sycamores with mottled, white, recumbent limbs and "
|
|
||||||
"branches that arch over the pool")
|
|
||||||
}))
|
|
||||||
|
|
||||||
writer.add_document(doc)
|
|
||||||
|
|
||||||
doc = schema.parse_document(json.dumps({
|
|
||||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||||
"body": ("You will rejoice to hear that no disaster has "
|
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
|
||||||
"accompanied the commencement of an enterprise which you "
|
}""")
|
||||||
"have regarded with such evil forebodings. I arrived "
|
writer.commit()
|
||||||
"here yesterday, and my first task is to assure my dear "
|
index.reload()
|
||||||
"sister of my welfare and increasing confidence in the "
|
return index
|
||||||
"success of my undertaking.")
|
|
||||||
}))
|
|
||||||
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.commit()
|
|
||||||
|
|
||||||
reader = index.reader()
|
|
||||||
searcher = reader.searcher()
|
|
||||||
index = index
|
|
||||||
schema = schema
|
|
||||||
default_args = [title, body]
|
|
||||||
ret = (index, searcher, schema, default_args, title, body)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
|
|
||||||
class TestClass(object):
|
class TestClass(object):
|
||||||
|
|
||||||
def test_simple_search(self, ram_index):
|
def test_simple_search(self, ram_index):
|
||||||
index, searcher, schema, default_args, title, body = ram_index
|
index = ram_index
|
||||||
query_parser = tantivy.QueryParser.for_index(index, default_args)
|
query = index.parse_query("sea whale", ["title", "body"])
|
||||||
query = query_parser.parse_query("sea whale")
|
|
||||||
|
|
||||||
top_docs = tantivy.TopDocs(10)
|
top_docs = tantivy.TopDocs(10)
|
||||||
|
|
||||||
result = searcher.search(query, top_docs)
|
result = index.searcher().search(query, top_docs)
|
||||||
print(result)
|
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
|
|
||||||
_, doc_address = result[0]
|
_, doc_address = result[0]
|
||||||
|
searched_doc = index.searcher().doc(doc_address)
|
||||||
searched_doc = searcher.doc(doc_address)
|
assert searched_doc["title"] == ["The Old Man and the Sea"]
|
||||||
assert searched_doc.get_first(title) == "The Old Man and the Sea"
|
|
||||||
|
|
||||||
def test_doc(self):
|
|
||||||
builder = tantivy.SchemaBuilder()
|
|
||||||
title = builder.add_text_field("title", stored=True)
|
|
||||||
|
|
||||||
doc = tantivy.Document()
|
|
||||||
assert doc.is_empty
|
|
||||||
|
|
||||||
doc.add_text(title, "The Old Man and the Sea")
|
|
||||||
|
|
||||||
assert doc.get_first(title) == "The Old Man and the Sea"
|
|
||||||
|
|
||||||
assert doc.len == 1
|
|
||||||
assert not doc.is_empty
|
|
||||||
|
|
||||||
def test_and_query(self, ram_index):
|
def test_and_query(self, ram_index):
|
||||||
index, searcher, schema, default_args, title, body = ram_index
|
index = ram_index
|
||||||
q_parser = tantivy.QueryParser.for_index(index, default_args)
|
query = index.parse_query("title:men AND body:summer", default_field_names=["title", "body"])
|
||||||
# look for an intersection of documents
|
# look for an intersection of documents
|
||||||
query = q_parser.parse_query("title:men AND body:summer")
|
|
||||||
top_docs = tantivy.TopDocs(10)
|
top_docs = tantivy.TopDocs(10)
|
||||||
|
searcher = index.searcher()
|
||||||
result = searcher.search(query, top_docs)
|
result = searcher.search(query, top_docs)
|
||||||
print(result)
|
|
||||||
|
|
||||||
# summer isn't present
|
# summer isn't present
|
||||||
assert len(result) == 0
|
assert len(result) == 0
|
||||||
|
|
||||||
query = q_parser.parse_query("title:men AND body:winter")
|
query = index.parse_query("title:men AND body:winter", ["title", "body"])
|
||||||
result = searcher.search(query, top_docs)
|
result = searcher.search(query, top_docs)
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
|
|
||||||
|
def test_and_query_parser_default_fields(self, ram_index):
|
||||||
|
query = ram_index.parse_query("winter", default_field_names=["title"])
|
||||||
|
assert repr(query) == """Query(TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114])))"""
|
||||||
|
|
||||||
|
def test_and_query_parser_default_fields_undefined(self, ram_index):
|
||||||
|
query = ram_index.parse_query("winter")
|
||||||
|
assert repr(query) == "Query(BooleanQuery { subqueries: [" \
|
||||||
|
"(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \
|
||||||
|
"(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \
|
||||||
|
"})"
|
||||||
|
|
||||||
def test_query_errors(self, ram_index):
|
def test_query_errors(self, ram_index):
|
||||||
index, searcher, schema, default_args, title, body = ram_index
|
index = ram_index
|
||||||
q_parser = tantivy.QueryParser.for_index(index, default_args)
|
|
||||||
# no "bod" field
|
# no "bod" field
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
q_parser.parse_query("bod:title")
|
index.parse_query("bod:men", ["title", "body"])
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
PATH_TO_INDEX = "tests/test_index/"
|
||||||
def disk_index():
|
|
||||||
builder = tantivy.SchemaBuilder()
|
|
||||||
title = builder.add_text_field("title", stored=True)
|
|
||||||
body = builder.add_text_field("body")
|
|
||||||
default_args = [title, body]
|
|
||||||
schema = builder.build()
|
|
||||||
schema = schema
|
|
||||||
index = tantivy.Index(schema)
|
|
||||||
path_to_index = "tests/test_index/"
|
|
||||||
return index, path_to_index, schema, default_args, title, body
|
|
||||||
|
|
||||||
|
|
||||||
class TestFromDiskClass(object):
|
class TestFromDiskClass(object):
|
||||||
|
|
||||||
def test_exists(self, disk_index):
|
def test_exists(self):
|
||||||
# prefer to keep it separate in case anyone deletes this
|
# prefer to keep it separate in case anyone deletes this
|
||||||
# runs from the root directory
|
# runs from the root directory
|
||||||
index, path_to_index, _, _, _, _ = disk_index
|
assert Index.exists(PATH_TO_INDEX)
|
||||||
assert index.exists(path_to_index)
|
|
||||||
|
|
||||||
def test_opens_from_dir(self, disk_index):
|
def test_opens_from_dir(self):
|
||||||
_, path_to_index, schema, _, _, _ = disk_index
|
index = Index(schema(), PATH_TO_INDEX, reuse=True)
|
||||||
tantivy.Index(schema, path_to_index)
|
assert index.searcher().num_docs == 3
|
||||||
|
|
||||||
def test_create_readers(self, disk_index):
|
def test_create_readers(self):
|
||||||
_, path_to_index, schema, _, _, _ = disk_index
|
# not sure what is the point of this test.
|
||||||
idx = tantivy.Index(schema, path_to_index)
|
idx = Index(schema())
|
||||||
reload_policy = "OnCommit" # or "Manual"
|
assert idx.searcher().num_docs == 0
|
||||||
assert idx.reader(reload_policy, 4)
|
# by default this is manual mode
|
||||||
assert idx.reader("Manual", 4)
|
writer = idx.writer(30000000, 1)
|
||||||
|
writer.add_document(Document(title="mytitle", body="mybody"))
|
||||||
|
writer.commit()
|
||||||
|
assert idx.searcher().num_docs == 0
|
||||||
|
# Manual is the default setting.
|
||||||
|
# In this case, change are reflected only when
|
||||||
|
# the index is manually reloaded.
|
||||||
|
idx.reload()
|
||||||
|
assert idx.searcher().num_docs == 1
|
||||||
|
idx.config_reader("OnCommit", 4)
|
||||||
|
writer.add_document(Document(title="mytitle2", body="mybody2"))
|
||||||
|
writer.commit()
|
||||||
|
import time
|
||||||
|
for i in range(50):
|
||||||
|
# The index should be automatically reloaded.
|
||||||
|
# Wait for at most 5s for it to happen.
|
||||||
|
time.sleep(0.1)
|
||||||
|
if idx.searcher().num_docs == 2:
|
||||||
|
return
|
||||||
|
assert False
|
||||||
|
|
||||||
def test_create_writer_and_reader(self, disk_index):
|
|
||||||
_, path_to_index, schema, default_args, title, body = disk_index
|
|
||||||
idx = tantivy.Index(schema, path_to_index)
|
|
||||||
writer = idx.writer()
|
|
||||||
reload_policy = "OnCommit" # or "Manual"
|
|
||||||
reader = idx.reader(reload_policy, 4)
|
|
||||||
|
|
||||||
# check against the opstamp in the meta file
|
class TestSearcher(object):
|
||||||
meta_fname = "meta.json"
|
def test_searcher_repr(self, ram_index):
|
||||||
with open("{}{}".format(path_to_index, meta_fname)) as f:
|
assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)"
|
||||||
json_file = json.load(f)
|
|
||||||
expected_last_opstamp = json_file["opstamp"]
|
|
||||||
# ASSUMPTION
|
|
||||||
# We haven't had any deletes in the index
|
|
||||||
# so max_doc per index coincides with the value of `num_docs`
|
|
||||||
# summing them in all segments, gives the number of documents
|
|
||||||
expected_num_docs = sum([segment["max_doc"]
|
|
||||||
for segment in json_file["segments"]])
|
|
||||||
assert writer.commit_opstamp == expected_last_opstamp
|
|
||||||
|
|
||||||
q_parser = tantivy.QueryParser.for_index(idx, default_args)
|
|
||||||
# get all documents
|
|
||||||
query = q_parser.parse_query("*")
|
|
||||||
top_docs = tantivy.TopDocs(10)
|
|
||||||
|
|
||||||
docs = reader.searcher().search(query, top_docs)
|
class TestDocument(object):
|
||||||
for (_score, doc_addr) in docs:
|
|
||||||
print(reader.searcher().doc(doc_addr))
|
def test_document(self):
|
||||||
assert expected_num_docs == len(docs)
|
doc = tantivy.Document(name="Bill", reference=[1, 2])
|
||||||
|
assert doc["reference"] == [1, 2]
|
||||||
|
assert doc["name"] == ["Bill"]
|
||||||
|
assert doc.get_first("name") == "Bill"
|
||||||
|
assert doc.get_first("reference") == 1
|
||||||
|
assert doc.to_dict() == {"name": ["Bill"], "reference": [1, 2]}
|
||||||
|
|
||||||
|
def test_document_with_date(self):
|
||||||
|
import datetime
|
||||||
|
date = datetime.datetime(2019, 8, 12, 13, 0, 0, )
|
||||||
|
doc = tantivy.Document(name="Bill", date=date)
|
||||||
|
assert doc["date"][0] == date
|
||||||
|
|
||||||
|
def test_document_repr(self):
|
||||||
|
doc = tantivy.Document(name="Bill", reference=[1, 2])
|
||||||
|
assert repr(doc) == "Document(name=[Bill],reference=[1,2])"
|
||||||
|
|
||||||
|
def test_document_with_facet(self):
|
||||||
|
doc = tantivy.Document()
|
||||||
|
facet = tantivy.Facet.from_string("/europe/france")
|
||||||
|
doc.add_facet("facet", facet)
|
||||||
|
assert doc["facet"][0].to_path() == ['europe', 'france']
|
||||||
|
doc = tantivy.Document()
|
||||||
|
facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
|
||||||
|
doc.add_facet("facet", facet)
|
||||||
|
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
|
||||||
|
assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
|
||||||
|
assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"
|
||||||
|
doc = tantivy.Document(facet=facet)
|
||||||
|
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
|
||||||
|
|
||||||
|
def test_document_error(self):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
tantivy.Document(name={})
|
||||||
|
|
Loading…
Reference in New Issue