Merge pull request #3 from tantivy-search/api-change

Different API Change.
master
Paul Masurel 2019-09-02 09:52:29 +09:00 committed by GitHub
commit f57c4669c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 656 additions and 582 deletions

View File

@ -11,7 +11,8 @@ crate-type = ["cdylib"]
[dependencies] [dependencies]
chrono = "0.4" chrono = "0.4"
tantivy = { git = "https://github.com/tantivy-search/tantivy" } tantivy = { git = "https://github.com/tantivy-search/tantivy", branch = "master" }
itertools = "0.8"
[dependencies.pyo3] [dependencies.pyo3]
version = "0.7.0" version = "0.7.0"

View File

@ -24,35 +24,37 @@ needs to be built. After that documents can be added to the index and a reader
can be created to search the index. can be created to search the index.
```python ```python
builder = tantivy.SchemaBuilder() import tantivy
title = builder.add_text_field("title", stored=True) # Declaring our schema.
body = builder.add_text_field("body") schema_builder = tantivy.SchemaBuilder()
schema_builder.add_text_field("title", stored=True)
schema_builder.add_text_field("body", stored=True)
schema = schema_builder.build()
schema = builder.build() # Creating our index (in memory, but filesystem is available too)
index = tantivy.Index(schema) index = tantivy.Index(schema)
writer = index.writer()
doc = tantivy.Document() # Adding one document.
doc.add_text(title, "The Old Man and the Sea") writer = index.writer()
doc.add_text(body, ("He was an old man who fished alone in a skiff in" writer.add_document({
"the Gulf Stream and he had gone eighty-four days " "title": "The Old Man and the Sea",
"now without taking a fish.")) "body": """He was an old man who fished alone in a skiff in
writer.add_document(doc) the Gulf Stream and he had gone eighty-four days
now without taking a fish."""
})
# ... and committing
writer.commit() writer.commit()
reader = index.reader()
searcher = reader.searcher()
query_parser = tantivy.QueryParser.for_index(index, [title, body]) # Reload the index to ensure it points to the last commit.
query = query_parser.parse_query("sea whale") index.reload();
searcher = index.searcher()
query = index.parse_query("sea whale", ["title", "body"])
top_docs = tantivy.TopDocs(3)
top_docs = tantivy.TopDocs(10) (best_score, best_doc_address) = searcher.search(query, nhits=3)[0]
result = searcher.search(query, top_docs) best_doc = searcher.doc(best_doc_address)
assert best_doc["title"] == ["The Old Man and the Sea"]
_, doc_address = result[0]
searched_doc = searcher.doc(doc_address)
assert searched_doc.get_first(title) == "The Old Man and the Sea"
``` ```

View File

@ -1,75 +1,255 @@
#![allow(clippy::new_ret_no_self)] #![allow(clippy::new_ret_no_self)]
#![allow(clippy::wrong_self_convention)]
use itertools::Itertools;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyDateTime; use pyo3::types::{PyAny, PyDateTime, PyDict, PyList, PyTuple};
use pyo3::types::{PyDateAccess, PyTimeAccess}; use pyo3::types::{PyDateAccess, PyTimeAccess};
use chrono::offset::TimeZone; use chrono::offset::TimeZone;
use chrono::Utc; use chrono::{Datelike, Timelike, Utc};
use tantivy as tv; use tantivy as tv;
use crate::facet::Facet; use crate::facet::Facet;
use crate::field::{Field, FieldValue}; use crate::to_pyerr;
use pyo3::{PyMappingProtocol, PyObjectProtocol};
use std::collections::BTreeMap;
use std::fmt;
use tantivy::schema::Value;
/// Converts a tantivy `Value` into the matching Python object.
///
/// Strings, integers, floats and bytes are converted through pyo3's
/// conversion traits, dates become `datetime` objects built without
/// `tzinfo`, and facets are wrapped in the `Facet` class.
///
/// Returns an error if the Python `datetime` cannot be constructed.
fn value_to_py(py: Python, value: &Value) -> PyResult<PyObject> {
    let object = match value {
        Value::Str(text) => text.into_object(py),
        Value::U64(num) => num.into_object(py),
        Value::I64(num) => num.into_object(py),
        Value::F64(num) => num.into_object(py),
        Value::Bytes(b) => b.to_object(py),
        Value::Date(d) => {
            // Rebuild the datetime field by field; no tzinfo is attached.
            let py_datetime = PyDateTime::new(
                py,
                d.year(),
                d.month() as u8,
                d.day() as u8,
                d.hour() as u8,
                d.minute() as u8,
                d.second() as u8,
                d.timestamp_subsec_micros(),
                None,
            )?;
            py_datetime.into_object(py)
        }
        Value::Facet(f) => {
            let facet = Facet { inner: f.clone() };
            facet.into_object(py)
        }
    };
    Ok(object)
}
/// Renders a tantivy `Value` as a string for debug output.
///
/// Text is returned as-is, numbers use their `Display` form, bytes and
/// dates use their `Debug` form, and facets use `to_string()`.
fn value_to_string(value: &Value) -> String {
    match value {
        Value::Str(text) => text.clone(),
        Value::U64(num) => num.to_string(),
        Value::I64(num) => num.to_string(),
        Value::F64(num) => num.to_string(),
        Value::Bytes(bytes) => format!("{:?}", bytes),
        Value::Date(d) => format!("{:?}", d),
        Value::Facet(facet) => facet.to_string(),
    }
}
/// Tantivy's Document is the object that can be indexed and then searched for. /// Tantivy's Document is the object that can be indexed and then searched for.
/// ///
/// Documents are fundamentally a collection of unordered tuples /// Documents are fundamentally a collection of unordered tuples
/// (field, value). In this list, one field may appear more than once. /// (field_name, value). In this list, one field may appear more than once.
/// ///
/// Example: /// Example:
/// >>> doc = tantivy.Document() /// >>> doc = tantivy.Document()
/// >>> doc.add_text(title, "The Old Man and the Sea") /// >>> doc.add_text("title", "The Old Man and the Sea")
/// >>> doc.add_text(body, ("He was an old man who fished alone in a " /// >>> doc.add_text("body", ("He was an old man who fished alone in a "
/// "skiff in the Gulf Stream and he had gone " /// "skiff in the Gulf Stream and he had gone "
/// "eighty-four days now without taking a fish.")) /// "eighty-four days now without taking a fish."))
///
/// For simplicity, it is also possible to build a `Document` by passing the field
/// values directly as constructor arguments.
///
/// Example:
/// >>> doc = tantivy.Document(title=["The Old Man and the Sea"], body=["..."])
///
/// As syntactic sugar, tantivy also allows the user to pass a single value
/// if there is only one. In other words, the following is also legal.
///
/// Example:
/// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...")
#[pyclass] #[pyclass]
#[derive(Default)]
pub(crate) struct Document { pub(crate) struct Document {
pub(crate) inner: tv::Document, pub(crate) field_values: BTreeMap<String, Vec<tv::schema::Value>>,
}
/// Compact debug rendering used by `__repr__`.
///
/// Produces `Document(field=[values],...)`, where the comma-joined values
/// of each field are shortened to at most 10 bytes.
impl fmt::Debug for Document {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Maximum number of bytes of joined values shown per field.
        const MAX_VALUES_LEN: usize = 10;

        let doc_str = self
            .field_values
            .iter()
            .map(|(field_name, field_values)| {
                let mut values_str =
                    field_values.iter().map(value_to_string).join(",");
                // `String::truncate` panics when the cut falls inside a
                // multi-byte UTF-8 character, so back off to the nearest
                // char boundary. Index 0 is always a boundary, which
                // guarantees the loop terminates.
                let mut cut = values_str.len().min(MAX_VALUES_LEN);
                while !values_str.is_char_boundary(cut) {
                    cut -= 1;
                }
                values_str.truncate(cut);
                format!("{}=[{}]", field_name, values_str)
            })
            .join(",");
        write!(f, "Document({})", doc_str)
    }
}
/// Appends `value` to the list of values stored under `field_name`,
/// creating the list when the field is seen for the first time.
fn add_value<T>(doc: &mut Document, field_name: String, value: T)
where
    Value: From<T>,
{
    let values = doc.field_values.entry(field_name).or_default();
    values.push(Value::from(value));
}
/// Converts a single Python object into a tantivy `Value`.
///
/// Conversions are attempted in this order: `str`, signed integer, float,
/// `datetime` (its fields are read as a UTC timestamp) and `Facet`.
/// The first conversion that succeeds wins.
///
/// Returns an error when none of the conversions applies.
fn extract_value(any: &PyAny) -> PyResult<Value> {
    if let Ok(text) = any.extract::<String>() {
        Ok(Value::Str(text))
    } else if let Ok(integer) = any.extract::<i64>() {
        Ok(Value::I64(integer))
    } else if let Ok(float) = any.extract::<f64>() {
        Ok(Value::F64(float))
    } else if let Ok(py_datetime) = any.downcast_ref::<PyDateTime>() {
        let date = Utc.ymd(
            py_datetime.get_year(),
            py_datetime.get_month().into(),
            py_datetime.get_day().into(),
        );
        let datetime = date.and_hms_micro(
            py_datetime.get_hour().into(),
            py_datetime.get_minute().into(),
            py_datetime.get_second().into(),
            py_datetime.get_microsecond(),
        );
        Ok(Value::Date(datetime))
    } else if let Ok(facet) = any.downcast_ref::<Facet>() {
        Ok(Value::Facet(facet.inner.clone()))
    } else {
        Err(to_pyerr(format!("Value unsupported {:?}", any)))
    }
}
fn extract_value_single_or_list(any: &PyAny) -> PyResult<Vec<Value>> {
if let Ok(values) = any.downcast_ref::<PyList>() {
values.iter().map(extract_value).collect()
} else {
Ok(vec![extract_value(any)?])
}
} }
#[pymethods] #[pymethods]
impl Document { impl Document {
#[new] #[new]
fn new(obj: &PyRawObject) { #[args(kwargs = "**")]
obj.init(Document { fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
inner: tv::Document::default(), let mut document = Document::default();
}); if let Some(field_dict) = kwargs {
document.extend(field_dict)?;
}
obj.init(document);
Ok(())
}
/// Merges the entries of a Python dict into this document.
///
/// Each key is a field name and each value is either a single value or a
/// list of values. Values already stored under the same field name are
/// replaced. Entries are staged in a scratch map first, so a failed
/// conversion leaves the document unchanged.
fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> {
    let mut staged: BTreeMap<String, Vec<tv::schema::Value>> = BTreeMap::new();
    for item in py_dict.items() {
        // Anything that is not a (key, value) pair is skipped.
        let pair = match item.downcast_ref::<PyTuple>() {
            Ok(pair) if pair.len() == 2 => pair,
            _ => continue,
        };
        let field_name: String = pair.get_item(0).extract()?;
        let values = extract_value_single_or_list(pair.get_item(1))?;
        staged.insert(field_name, values);
    }
    self.field_values.extend(staged);
    Ok(())
}
#[staticmethod]
fn from_dict(py_dict: &PyDict) -> PyResult<Document> {
let mut field_values: BTreeMap<String, Vec<tv::schema::Value>> =
BTreeMap::new();
for key_value_any in py_dict.items() {
if let Ok(key_value) = key_value_any.downcast_ref::<PyTuple>() {
if key_value.len() != 2 {
continue;
}
let key: String = key_value.get_item(0).extract()?;
let value_list =
extract_value_single_or_list(key_value.get_item(1))?;
field_values.insert(key, value_list);
}
}
Ok(Document { field_values })
}
/// Returns a dictionary with the different
/// field values.
///
/// In tantivy, a `Document` can hold multiple
/// values for a single field.
///
/// For this reason, the dictionary associates
/// a list of values with every field.
fn to_dict(&self, py: Python) -> PyResult<PyObject> {
let dict = PyDict::new(py);
for (key, values) in &self.field_values {
let values_py: Vec<PyObject> = values
.iter()
.map(|v| value_to_py(py, v))
.collect::<PyResult<_>>()?;
dict.set_item(key, values_py)?;
}
Ok(dict.into())
} }
/// Add a text value to the document. /// Add a text value to the document.
/// ///
/// Args: /// Args:
/// field (Field): The field for which we are adding the text. /// field_name (str): The field name for which we are adding the text.
/// text (str): The text that will be added to the document. /// text (str): The text that will be added to the document.
fn add_text(&mut self, field: &Field, text: &str) { fn add_text(&mut self, field_name: String, text: &str) {
self.inner.add_text(field.inner, text); add_value(self, field_name, text);
} }
/// Add an unsigned integer value to the document. /// Add an unsigned integer value to the document.
/// ///
/// Args: /// Args:
/// field (Field): The field for which we are adding the integer. /// field_name (str): The field name for which we are adding the unsigned integer.
/// value (int): The integer that will be added to the document. /// value (int): The integer that will be added to the document.
fn add_unsigned(&mut self, field: &Field, value: u64) { fn add_unsigned(&mut self, field_name: String, value: u64) {
self.inner.add_u64(field.inner, value); add_value(self, field_name, value);
} }
/// Add a signed integer value to the document. /// Add a signed integer value to the document.
/// ///
/// Args: /// Args:
/// field (Field): The field for which we are adding the integer. /// field_name (str): The field name for which we are adding the integer.
/// value (int): The integer that will be added to the document. /// value (int): The integer that will be added to the document.
fn add_integer(&mut self, field: &Field, value: i64) { fn add_integer(&mut self, field_name: String, value: i64) {
self.inner.add_i64(field.inner, value); add_value(self, field_name, value);
} }
/// Add a date value to the document. /// Add a date value to the document.
/// ///
/// Args: /// Args:
/// field (Field): The field for which we are adding the integer. /// field_name (str): The field name for which we are adding the date.
/// value (datetime): The date that will be added to the document. /// value (datetime): The date that will be added to the document.
fn add_date(&mut self, field: &Field, value: &PyDateTime) { fn add_date(&mut self, field_name: String, value: &PyDateTime) {
let datetime = Utc let datetime = Utc
.ymd( .ymd(
value.get_year(), value.get_year(),
@ -82,37 +262,36 @@ impl Document {
value.get_second().into(), value.get_second().into(),
value.get_microsecond(), value.get_microsecond(),
); );
add_value(self, field_name, datetime);
self.inner.add_date(field.inner, &datetime);
} }
/// Add a facet value to the document. /// Add a facet value to the document.
/// Args: /// Args:
/// field (Field): The field for which we are adding the facet. /// field_name (str): The field name for which we are adding the facet.
/// value (Facet): The Facet that will be added to the document. /// value (Facet): The Facet that will be added to the document.
fn add_facet(&mut self, field: &Field, value: &Facet) { fn add_facet(&mut self, field_name: String, facet: &Facet) {
self.inner.add_facet(field.inner, value.inner.clone()); add_value(self, field_name, facet.inner.clone());
} }
/// Add a bytes value to the document. /// Add a bytes value to the document.
/// ///
/// Args: /// Args:
/// field (Field): The field for which we are adding the bytes. /// field_name (str): The field for which we are adding the bytes.
/// value (bytes): The bytes that will be added to the document. /// value (bytes): The bytes that will be added to the document.
fn add_bytes(&mut self, field: &Field, value: Vec<u8>) { fn add_bytes(&mut self, field_name: String, bytes: Vec<u8>) {
self.inner.add_bytes(field.inner, value); add_value(self, field_name, bytes);
} }
/// Returns the number of added fields that have been added to the document /// Returns the number of added fields that have been added to the document
#[getter] #[getter]
fn len(&self) -> usize { fn num_fields(&self) -> usize {
self.inner.len() self.field_values.len()
} }
/// True if the document is empty, False otherwise. /// True if the document is empty, False otherwise.
#[getter] #[getter]
fn is_empty(&self) -> bool { fn is_empty(&self) -> bool {
self.inner.is_empty() self.field_values.is_empty()
} }
/// Get the first value associated with the given field. /// Get the first value associated with the given field.
@ -122,9 +301,17 @@ impl Document {
/// ///
/// Returns the value if one is found, otherwise None. /// Returns the value if one is found, otherwise None.
/// The type of the value depends on the field. /// The type of the value depends on the field.
fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> { fn get_first(
let value = self.inner.get_first(field.inner)?; &self,
FieldValue::value_to_py(py, value) py: Python,
fieldname: &str,
) -> PyResult<Option<PyObject>> {
if let Some(value) = self.iter_values_for_field(fieldname).next() {
let py_value = value_to_py(py, value)?;
Ok(Some(py_value))
} else {
Ok(None)
}
} }
/// Get the all values associated with the given field. /// Get the all values associated with the given field.
@ -134,21 +321,37 @@ impl Document {
/// ///
/// Returns a list of values. /// Returns a list of values.
/// The type of the value depends on the field. /// The type of the value depends on the field.
fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> { fn get_all(&self, py: Python, field_name: &str) -> PyResult<Vec<PyObject>> {
let values = self.inner.get_all(field.inner); self.iter_values_for_field(field_name)
values .map(|value| value_to_py(py, value))
.iter() .collect::<PyResult<Vec<_>>>()
.map(|&v| FieldValue::value_to_py(py, v)) }
.filter_map(|x| x)
.collect()
} }
/// Get all the fields and values contained in the document. impl Document {
fn field_values(&self, py: Python) -> Vec<FieldValue> { fn iter_values_for_field<'a>(
let field_values = self.inner.field_values(); &'a self,
field_values field: &str,
.iter() ) -> impl Iterator<Item = &'a Value> + 'a {
.map(|v| FieldValue::field_value_to_py(py, v)) self.field_values
.collect() .get(field)
.into_iter()
.flat_map(|values| values.iter())
}
}
#[pyproto]
impl PyMappingProtocol for Document {
    /// `doc[field_name]`: returns the list of all values stored under
    /// `field_name`, delegating to `get_all`.
    fn __getitem__(&self, field_name: &str) -> PyResult<Vec<PyObject>> {
        // The guard must stay alive for as long as the token is in use.
        let gil_guard = Python::acquire_gil();
        self.get_all(gil_guard.python(), field_name)
    }
}
#[pyproto]
impl PyObjectProtocol for Document {
fn __repr__(&self) -> PyResult<String> {
Ok(format!("{:?}", self))
} }
} }

View File

@ -1,6 +1,6 @@
use pyo3::basic::PyObjectProtocol;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyType; use pyo3::types::PyType;
use tantivy::schema; use tantivy::schema;
/// A Facet represent a point in a given hierarchy. /// A Facet represent a point in a given hierarchy.
@ -52,4 +52,23 @@ impl Facet {
inner: schema::Facet::from_text(facet_string), inner: schema::Facet::from_text(facet_string),
} }
} }
/// Returns the list of `segments` that forms a facet path.
///
/// For instance `//europe/france` becomes `["europe", "france"]`.
fn to_path(&self) -> Vec<&str> {
self.inner.to_path()
}
/// Returns the facet string representation.
fn to_path_str(&self) -> String {
self.inner.to_string()
}
}
#[pyproto]
impl PyObjectProtocol for Facet {
fn __repr__(&self) -> PyResult<String> {
Ok(format!("Facet({})", self.to_path_str()))
}
} }

View File

@ -1,73 +0,0 @@
use pyo3::prelude::*;
use pyo3::types::PyDateTime;
use tantivy::schema;
use crate::facet::Facet;
/// Field is a numeric identifier that represents an entry in the Schema.
#[pyclass]
#[derive(Clone)]
pub(crate) struct Field {
/// The wrapped tantivy schema field.
pub(crate) inner: schema::Field,
}
/// FieldValue holds together a Field and its Value.
#[pyclass]
pub(crate) struct FieldValue {
/// The field this value belongs to.
pub(crate) field: Field,
/// The value, already converted to a Python object.
pub(crate) value: PyObject,
}
#[pymethods]
impl FieldValue {
/// The field of this field/value pair (returned as a clone).
#[getter]
fn field(&self) -> Field {
self.field.clone()
}
/// The Python object holding the value of this field/value pair.
#[getter]
fn value(&self) -> &PyObject {
&self.value
}
}
impl FieldValue {
    /// Converts a tantivy value into a Python object.
    ///
    /// Returns `None` only when a date cannot be turned into a Python
    /// `datetime`; every other variant always yields `Some`.
    pub(crate) fn value_to_py(
        py: Python,
        value: &schema::Value,
    ) -> Option<PyObject> {
        let object = match value {
            schema::Value::Str(text) => text.into_object(py),
            schema::Value::U64(num) => num.into_object(py),
            schema::Value::I64(num) => num.into_object(py),
            schema::Value::F64(num) => num.into_object(py),
            schema::Value::Bytes(b) => b.to_object(py),
            schema::Value::Date(d) => {
                // The datetime is built from the whole-second timestamp;
                // a conversion failure is reported as `None`.
                return PyDateTime::from_timestamp(
                    py,
                    d.timestamp() as f64,
                    None,
                )
                .ok()
                .map(|date| date.into_object(py));
            }
            schema::Value::Facet(f) => {
                Facet { inner: f.clone() }.into_object(py)
            }
        };
        Some(object)
    }

    /// Converts a tantivy field/value pair into its Python wrapper.
    ///
    /// Panics if the value cannot be converted (see `value_to_py`).
    pub(crate) fn field_value_to_py(
        py: Python,
        field_value: &schema::FieldValue,
    ) -> FieldValue {
        let py_value =
            FieldValue::value_to_py(py, field_value.value()).unwrap();
        let field = Field {
            inner: field_value.field(),
        };
        FieldValue {
            field,
            value: py_value,
        }
    }
}

View File

@ -4,60 +4,24 @@ use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use crate::document::Document; use crate::document::Document;
use crate::query::Query;
use crate::schema::Schema; use crate::schema::Schema;
use crate::searcher::Searcher; use crate::searcher::Searcher;
use crate::to_pyerr;
use tantivy as tv; use tantivy as tv;
use tantivy::directory::MmapDirectory; use tantivy::directory::MmapDirectory;
use tantivy::schema::{Field, NamedFieldDocument};
const RELOAD_POLICY: &str = "commit"; const RELOAD_POLICY: &str = "commit";
/// IndexReader is the entry point to read and search the index.
///
/// IndexReader controls when a new version of the index should be loaded and
/// lends you instances of Searcher for the last loaded version.
///
/// To create an IndexReader first create an Index and call the reader() method
/// on the index object.
#[pyclass]
pub(crate) struct IndexReader {
inner: tv::IndexReader,
}
#[pymethods]
impl IndexReader {
/// Update searchers so that they reflect the state of the last .commit().
///
/// If you set up the the reload policy to be on 'commit' (which is the
/// default) every commit should be rapidly reflected on your IndexReader
/// and you should not need to call reload() at all.
fn reload(&self) -> PyResult<()> {
let ret = self.inner.reload();
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Get a Searcher for the index.
///
/// This method should be called every single time a search query is
/// performed. The searchers are taken from a pool of num_searchers
/// searchers.
///
/// Returns a Searcher object, if no searcher is available this may block.
fn searcher(&self) -> Searcher {
let searcher = self.inner.searcher();
Searcher { inner: searcher }
}
}
/// IndexWriter is the user entry-point to add documents to the index. /// IndexWriter is the user entry-point to add documents to the index.
/// ///
/// To create an IndexWriter first create an Index and call the writer() method /// To create an IndexWriter first create an Index and call the writer() method
/// on the index object. /// on the index object.
#[pyclass] #[pyclass]
pub(crate) struct IndexWriter { pub(crate) struct IndexWriter {
inner: tv::IndexWriter, inner_index_writer: tv::IndexWriter,
schema: tv::schema::Schema,
} }
#[pymethods] #[pymethods]
@ -70,9 +34,24 @@ impl IndexWriter {
/// by the client to align commits with its own document queue. /// by the client to align commits with its own document queue.
/// The `opstamp` represents the number of documents that have been added /// The `opstamp` represents the number of documents that have been added
/// since the creation of the index. /// since the creation of the index.
fn add_document(&mut self, document: &Document) -> PyResult<()> { pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
self.inner.add_document(document.inner.clone()); let named_doc = NamedFieldDocument(doc.field_values.clone());
Ok(()) let doc = self.schema.convert_named_doc(named_doc).map_err(to_pyerr)?;
Ok(self.inner_index_writer.add_document(doc))
}
/// Helper for the `add_document` method, but passing a json string.
///
/// If the indexing pipeline is full, this call may block.
///
/// Returns an `opstamp`, which is an increasing integer that can be used
/// by the client to align commits with its own document queue.
/// The `opstamp` represents the number of documents that have been added
/// since the creation of the index.
pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
let doc = self.schema.parse_document(json).map_err(to_pyerr)?;
let opstamp = self.inner_index_writer.add_document(doc);
Ok(opstamp)
} }
/// Commits all of the pending changes /// Commits all of the pending changes
@ -84,12 +63,8 @@ impl IndexWriter {
/// spared), it will be possible to resume indexing from this point. /// spared), it will be possible to resume indexing from this point.
/// ///
/// Returns the `opstamp` of the last document that made it in the commit. /// Returns the `opstamp` of the last document that made it in the commit.
fn commit(&mut self) -> PyResult<()> { fn commit(&mut self) -> PyResult<u64> {
let ret = self.inner.commit(); self.inner_index_writer.commit().map_err(to_pyerr)
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
} }
/// Rollback to the last commit /// Rollback to the last commit
@ -97,23 +72,15 @@ impl IndexWriter {
/// This cancels all of the update that happened before after the last /// This cancels all of the update that happened before after the last
/// commit. After calling rollback, the index is in the same state as it /// commit. After calling rollback, the index is in the same state as it
/// was after the last commit. /// was after the last commit.
fn rollback(&mut self) -> PyResult<()> { fn rollback(&mut self) -> PyResult<u64> {
let ret = self.inner.rollback(); self.inner_index_writer.rollback().map_err(to_pyerr)
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
} }
/// Detect and removes the files that are not used by the index anymore. /// Detect and removes the files that are not used by the index anymore.
fn garbage_collect_files(&mut self) -> PyResult<()> { fn garbage_collect_files(&mut self) -> PyResult<()> {
let ret = self.inner.garbage_collect_files(); self.inner_index_writer
.garbage_collect_files()
match ret { .map_err(to_pyerr)
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
} }
/// The opstamp of the last successful commit. /// The opstamp of the last successful commit.
@ -125,7 +92,7 @@ impl IndexWriter {
/// for searchers. /// for searchers.
#[getter] #[getter]
fn commit_opstamp(&self) -> u64 { fn commit_opstamp(&self) -> u64 {
self.inner.commit_opstamp() self.inner_index_writer.commit_opstamp()
} }
} }
@ -142,11 +109,19 @@ impl IndexWriter {
/// if there was a problem during the opening or creation of the index. /// if there was a problem during the opening or creation of the index.
#[pyclass] #[pyclass]
pub(crate) struct Index { pub(crate) struct Index {
pub(crate) inner: tv::Index, pub(crate) index: tv::Index,
reader: tv::IndexReader,
} }
#[pymethods] #[pymethods]
impl Index { impl Index {
/// Opens the index stored in the directory at `path` and creates the
/// reader used to serve searches.
///
/// Raises ValueError if the index cannot be opened or the reader cannot
/// be created.
#[staticmethod]
fn open(path: &str) -> PyResult<Index> {
    tv::Index::open_in_dir(path)
        .and_then(|index| {
            let reader = index.reader()?;
            Ok(Index { index, reader })
        })
        .map_err(to_pyerr)
}
#[new] #[new]
#[args(reuse = true)] #[args(reuse = true)]
fn new( fn new(
@ -157,32 +132,19 @@ impl Index {
) -> PyResult<()> { ) -> PyResult<()> {
let index = match path { let index = match path {
Some(p) => { Some(p) => {
let directory = MmapDirectory::open(p); let directory = MmapDirectory::open(p).map_err(to_pyerr)?;
if reuse {
let dir = match directory { tv::Index::open_or_create(directory, schema.inner.clone())
Ok(d) => d,
Err(e) => {
return Err(exceptions::OSError::py_err(e.to_string()))
}
};
let i = if reuse {
tv::Index::open_or_create(dir, schema.inner.clone())
} else { } else {
tv::Index::create(dir, schema.inner.clone()) tv::Index::create(directory, schema.inner.clone())
};
match i {
Ok(index) => index,
Err(e) => {
return Err(exceptions::OSError::py_err(e.to_string()))
}
} }
.map_err(to_pyerr)?
} }
None => tv::Index::create_in_ram(schema.inner.clone()), None => tv::Index::create_in_ram(schema.inner.clone()),
}; };
obj.init(Index { inner: index }); let reader = index.reader().map_err(to_pyerr)?;
obj.init(Index { index, reader });
Ok(()) Ok(())
} }
@ -206,32 +168,30 @@ impl Index {
num_threads: usize, num_threads: usize,
) -> PyResult<IndexWriter> { ) -> PyResult<IndexWriter> {
let writer = match num_threads { let writer = match num_threads {
0 => self.inner.writer(heap_size), 0 => self.index.writer(heap_size),
_ => self.inner.writer_with_num_threads(num_threads, heap_size), _ => self.index.writer_with_num_threads(num_threads, heap_size),
};
match writer {
Ok(w) => Ok(IndexWriter { inner: w }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
} }
.map_err(to_pyerr)?;
let schema = self.index.schema();
Ok(IndexWriter {
inner_index_writer: writer,
schema,
})
} }
/// Create an IndexReader for the index. /// Configure the index reader.
/// ///
/// Args: /// Args:
/// reload_policy (str, optional): The reload policy that the /// reload_policy (str, optional): The reload policy that the
/// IndexReader should use. Can be manual or OnCommit. /// IndexReader should use. Can be `Manual` or `OnCommit`.
/// num_searchers (int, optional): The number of searchers that the /// num_searchers (int, optional): The number of searchers that the
/// reader should create. /// reader should create.
///
/// Returns the IndexReader on success, raises ValueError if a IndexReader
/// couldn't be created.
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)] #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
fn reader( fn config_reader(
&self, &mut self,
reload_policy: &str, reload_policy: &str,
num_searchers: usize, num_searchers: usize,
) -> PyResult<IndexReader> { ) -> Result<(), PyErr> {
let reload_policy = reload_policy.to_lowercase(); let reload_policy = reload_policy.to_lowercase();
let reload_policy = match reload_policy.as_ref() { let reload_policy = match reload_policy.as_ref() {
"commit" => tv::ReloadPolicy::OnCommit, "commit" => tv::ReloadPolicy::OnCommit,
@ -242,9 +202,7 @@ impl Index {
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'" "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
)) ))
}; };
let builder = self.index.reader_builder();
let builder = self.inner.reader_builder();
let builder = builder.reload_policy(reload_policy); let builder = builder.reload_policy(reload_policy);
let builder = if num_searchers > 0 { let builder = if num_searchers > 0 {
builder.num_searchers(num_searchers) builder.num_searchers(num_searchers)
@ -252,10 +210,23 @@ impl Index {
builder builder
}; };
let reader = builder.try_into(); self.reader = builder.try_into().map_err(to_pyerr)?;
match reader { Ok(())
Ok(r) => Ok(IndexReader { inner: r }), }
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
/// Acquires a Searcher from the searcher pool.
///
/// If no searcher is available during the call, note that
/// this call will block until one is made available.
///
/// Searchers are automatically released back into the pool when
/// they are dropped. If you observe this function blocking forever,
/// you should probably configure the Index to have a larger
/// searcher pool, or you are holding references to previous searchers
/// forever.
fn searcher(&self) -> Searcher {
Searcher {
inner: self.reader.searcher(),
} }
} }
@ -268,19 +239,71 @@ impl Index {
/// Raises OSError if the directory cannot be opened. /// Raises OSError if the directory cannot be opened.
#[staticmethod] #[staticmethod]
fn exists(path: &str) -> PyResult<bool> { fn exists(path: &str) -> PyResult<bool> {
let directory = MmapDirectory::open(path); let directory = MmapDirectory::open(path).map_err(to_pyerr)?;
let dir = match directory { Ok(tv::Index::exists(&directory))
Ok(d) => d,
Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
};
Ok(tv::Index::exists(&dir))
} }
/// The schema of the current index. /// The schema of the current index.
#[getter] #[getter]
fn schema(&self) -> Schema { fn schema(&self) -> Schema {
let schema = self.inner.schema(); let schema = self.index.schema();
Schema { inner: schema } Schema { inner: schema }
} }
/// Update searchers so that they reflect the state of the last .commit().
///
/// If you set up the reload policy to be on 'commit' (which is the
/// default) every commit should be rapidly reflected by the index's
/// reader and you should not need to call reload() at all.
///
/// Raises ValueError if the reader fails to reload.
fn reload(&self) -> PyResult<()> {
self.reader.reload().map_err(to_pyerr)
}
/// Parse a query
///
/// Args:
/// query: the query, following the tantivy query language.
/// default_fields (List[Field]): A list of fields used to search if no
/// field is specified in the query.
///
#[args(reload_policy = "RELOAD_POLICY")]
pub fn parse_query(
&self,
query: &str,
default_field_names: Option<Vec<String>>,
) -> PyResult<Query> {
let mut default_fields = vec![];
let schema = self.index.schema();
if let Some(default_field_names_vec) = default_field_names {
for default_field_name in &default_field_names_vec {
if let Some(field) = schema.get_field(default_field_name) {
let field_entry = schema.get_field_entry(field);
if !field_entry.is_indexed() {
return Err(exceptions::ValueError::py_err(format!(
"Field `{}` is not set as indexed in the schema.",
default_field_name
)));
}
default_fields.push(field);
} else {
return Err(exceptions::ValueError::py_err(format!(
"Field `{}` is not defined in the schema.",
default_field_name
)));
}
}
} else {
for (field_id, field_entry) in
self.index.schema().fields().iter().enumerate()
{
if field_entry.is_indexed() {
default_fields.push(Field(field_id as u32));
}
}
}
let parser =
tv::query::QueryParser::for_index(&self.index, default_fields);
let query = parser.parse_query(query).map_err(to_pyerr)?;
Ok(Query { inner: query })
}
} }

View File

@ -1,8 +1,8 @@
use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
mod document; mod document;
mod facet; mod facet;
mod field;
mod index; mod index;
mod query; mod query;
mod schema; mod schema;
@ -11,9 +11,7 @@ mod searcher;
use document::Document; use document::Document;
use facet::Facet; use facet::Facet;
use field::{Field, FieldValue};
use index::Index; use index::Index;
use query::QueryParser;
use schema::Schema; use schema::Schema;
use schemabuilder::SchemaBuilder; use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, Searcher, TopDocs}; use searcher::{DocAddress, Searcher, TopDocs};
@ -75,14 +73,14 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Schema>()?; m.add_class::<Schema>()?;
m.add_class::<SchemaBuilder>()?; m.add_class::<SchemaBuilder>()?;
m.add_class::<Searcher>()?; m.add_class::<Searcher>()?;
m.add_class::<Index>()?;
m.add_class::<QueryParser>()?;
m.add_class::<Document>()?; m.add_class::<Document>()?;
m.add_class::<Index>()?;
m.add_class::<DocAddress>()?; m.add_class::<DocAddress>()?;
m.add_class::<TopDocs>()?; m.add_class::<TopDocs>()?;
m.add_class::<Field>()?;
m.add_class::<FieldValue>()?;
m.add_class::<Facet>()?; m.add_class::<Facet>()?;
Ok(()) Ok(())
} }
/// Converts any error that can be rendered with `to_string()` into a Python
/// `ValueError` carrying that text as its message.
pub(crate) fn to_pyerr<E: ToString>(err: E) -> PyErr {
    let message = err.to_string();
    exceptions::ValueError::py_err(message)
}

View File

@ -1,70 +1,16 @@
use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyType; use pyo3::PyObjectProtocol;
use tantivy as tv; use tantivy as tv;
use crate::field::Field;
use crate::index::Index;
/// Tantivy's Query /// Tantivy's Query
#[pyclass] #[pyclass]
pub(crate) struct Query { pub(crate) struct Query {
pub(crate) inner: Box<dyn tv::query::Query>, pub(crate) inner: Box<dyn tv::query::Query>,
} }
/// Tantivy's Query parser #[pyproto]
#[pyclass] impl PyObjectProtocol for Query {
pub(crate) struct QueryParser { fn __repr__(&self) -> PyResult<String> {
inner: tv::query::QueryParser, Ok(format!("Query({:?})", self.inner))
}
#[pymethods]
impl QueryParser {
/// Creates a QueryParser for an Index.
///
/// Args:
/// index (Index): The index for which the query will be created.
/// default_fields (List[Field]): A list of fields used to search if no
/// field is specified in the query.
///
/// Returns the QueryParser.
#[classmethod]
fn for_index(
_cls: &PyType,
index: &Index,
default_fields: Vec<&Field>,
) -> PyResult<QueryParser> {
let default_fields: Vec<tv::schema::Field> =
default_fields.iter().map(|&f| f.inner).collect();
let parser =
tv::query::QueryParser::for_index(&index.inner, default_fields);
Ok(QueryParser { inner: parser })
}
/// Parse a string into a query that can be given to a searcher.
///
/// Args:
/// query (str): A query string that should be parsed into a query.
///
/// Returns the parsed Query object. Raises ValueError if there was an
/// error with the query string.
fn parse_query(&self, query: &str) -> PyResult<Query> {
let ret = self.inner.parse_query(query);
match ret {
Ok(q) => Ok(Query { inner: q }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Set the default way to compose queries to a conjunction.
///
/// By default, the query happy tax payer is equivalent to the query happy
/// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
/// payer will be interpreted by the parser as happy AND tax AND payer.
fn set_conjunction_by_default(&mut self) {
self.inner.set_conjunction_by_default();
} }
} }

View File

@ -1,10 +1,5 @@
use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use tantivy as tv;
use tantivy::schema;
use crate::document::Document;
use crate::field::Field;
/// Tantivy schema. /// Tantivy schema.
/// ///
@ -12,54 +7,8 @@ use crate::field::Field;
/// provided. /// provided.
#[pyclass] #[pyclass]
pub(crate) struct Schema { pub(crate) struct Schema {
pub(crate) inner: schema::Schema, pub(crate) inner: tv::schema::Schema,
} }
#[pymethods] #[pymethods]
impl Schema { impl Schema {}
/// Build a document object from a json string.
///
/// Args:
/// doc_json (str) - A string containing json that should be parsed
/// into a `Document`
///
/// Returns the parsed document, raises a ValueError if the parsing failed.
fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
let ret = self.inner.parse_document(doc_json);
match ret {
Ok(d) => Ok(Document { inner: d }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Convert a `Document` object into a json string.
///
/// Args:
/// doc (Document): The document that will be converted into a json
/// string.
fn to_json(&self, doc: &Document) -> String {
self.inner.to_json(&doc.inner)
}
/// Return the field name for a given `Field`.
///
/// Args:
/// field (Field): The field for which the name will be returned.
fn get_field_name(&self, field: &Field) -> &str {
self.inner.get_field_name(field.inner)
}
/// Returns the field option associated with a given name.
///
/// Args:
/// name (str): The name of the field that we want to retrieve.
///
/// Returns the Field if one is found, None otherwise.
fn get_field(&self, name: &str) -> Option<Field> {
let f = self.inner.get_field(name);
match f {
Some(field) => Some(Field { inner: field }),
None => None,
}
}
}

View File

@ -5,8 +5,8 @@ use pyo3::prelude::*;
use tantivy::schema; use tantivy::schema;
use crate::field::Field;
use crate::schema::Schema; use crate::schema::Schema;
use std::sync::{Arc, RwLock};
/// Tantivy has a very strict schema. /// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not, /// You need to specify in advance whether a field is indexed or not,
@ -24,8 +24,9 @@ use crate::schema::Schema;
/// ///
/// >>> schema = builder.build() /// >>> schema = builder.build()
#[pyclass] #[pyclass]
#[derive(Clone)]
pub(crate) struct SchemaBuilder { pub(crate) struct SchemaBuilder {
pub(crate) builder: Option<schema::SchemaBuilder>, pub(crate) builder: Arc<RwLock<Option<schema::SchemaBuilder>>>,
} }
const TOKENIZER: &str = "default"; const TOKENIZER: &str = "default";
@ -36,7 +37,7 @@ impl SchemaBuilder {
#[new] #[new]
fn new(obj: &PyRawObject) { fn new(obj: &PyRawObject) {
obj.init(SchemaBuilder { obj.init(SchemaBuilder {
builder: Some(schema::Schema::builder()), builder: Arc::new(From::from(Some(schema::Schema::builder()))),
}); });
} }
@ -70,9 +71,8 @@ impl SchemaBuilder {
stored: bool, stored: bool,
tokenizer_name: &str, tokenizer_name: &str,
index_option: &str, index_option: &str,
) -> PyResult<Field> { ) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
let index_option = match index_option { let index_option = match index_option {
"position" => schema::IndexRecordOption::WithFreqsAndPositions, "position" => schema::IndexRecordOption::WithFreqsAndPositions,
"freq" => schema::IndexRecordOption::WithFreqs, "freq" => schema::IndexRecordOption::WithFreqs,
@ -94,14 +94,14 @@ impl SchemaBuilder {
options options
}; };
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_text_field(name, options); builder.add_text_field(name, options);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Add a new signed integer field to the schema. /// Add a new signed integer field to the schema.
@ -131,19 +131,19 @@ impl SchemaBuilder {
stored: bool, stored: bool,
indexed: bool, indexed: bool,
fast: Option<&str>, fast: Option<&str>,
) -> PyResult<Field> { ) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_i64_field(name, opts); builder.add_i64_field(name, opts);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Add a new unsigned integer field to the schema. /// Add a new unsigned integer field to the schema.
@ -173,19 +173,19 @@ impl SchemaBuilder {
stored: bool, stored: bool,
indexed: bool, indexed: bool,
fast: Option<&str>, fast: Option<&str>,
) -> PyResult<Field> { ) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_u64_field(name, opts); builder.add_u64_field(name, opts);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Add a new date field to the schema. /// Add a new date field to the schema.
@ -215,35 +215,35 @@ impl SchemaBuilder {
stored: bool, stored: bool,
indexed: bool, indexed: bool,
fast: Option<&str>, fast: Option<&str>,
) -> PyResult<Field> { ) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_date_field(name, opts); builder.add_date_field(name, opts);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Add a Facet field to the schema. /// Add a Facet field to the schema.
/// Args: /// Args:
/// name (str): The name of the field. /// name (str): The name of the field.
fn add_facet_field(&mut self, name: &str) -> PyResult<Field> { fn add_facet_field(&mut self, name: &str) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_facet_field(name); builder.add_facet_field(name);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Add a fast bytes field to the schema. /// Add a fast bytes field to the schema.
@ -254,17 +254,17 @@ impl SchemaBuilder {
/// ///
/// Args: /// Args:
/// name (str): The name of the field. /// name (str): The name of the field.
fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> { fn add_bytes_field(&mut self, name: &str) -> PyResult<Self> {
let builder = &mut self.builder; let builder = &mut self.builder;
if let Some(builder) = builder { if let Some(builder) = builder.write().unwrap().as_mut() {
let field = builder.add_bytes_field(name); builder.add_bytes_field(name);
Ok(Field { inner: field })
} else { } else {
Err(exceptions::ValueError::py_err( return Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.", "Schema builder object isn't valid anymore.",
)) ));
} }
Ok(self.clone())
} }
/// Finalize the creation of a Schema. /// Finalize the creation of a Schema.
@ -272,7 +272,7 @@ impl SchemaBuilder {
/// Returns a Schema object. After this is called the SchemaBuilder cannot /// Returns a Schema object. After this is called the SchemaBuilder cannot
/// be used anymore. /// be used anymore.
fn build(&mut self) -> PyResult<Schema> { fn build(&mut self) -> PyResult<Schema> {
let builder = self.builder.take(); let builder = self.builder.write().unwrap().take();
if let Some(builder) = builder { if let Some(builder) = builder {
let schema = builder.build(); let schema = builder.build();
Ok(Schema { inner: schema }) Ok(Schema { inner: schema })

View File

@ -1,12 +1,11 @@
#![allow(clippy::new_ret_no_self)] #![allow(clippy::new_ret_no_self)]
use pyo3::exceptions;
use pyo3::prelude::*;
use tantivy as tv;
use crate::document::Document; use crate::document::Document;
use crate::query::Query; use crate::query::Query;
use crate::to_pyerr;
use pyo3::prelude::*;
use pyo3::{exceptions, PyObjectProtocol};
use tantivy as tv;
/// Tantivy's Searcher class /// Tantivy's Searcher class
/// ///
@ -60,11 +59,11 @@ impl Searcher {
/// ///
/// Returns the Document, raises ValueError if the document can't be found. /// Returns the Document, raises ValueError if the document can't be found.
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> { fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
let ret = self.inner.doc(doc_address.into()); let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
match ret { let named_doc = self.inner.schema().to_named_doc(&doc);
Ok(doc) => Ok(Document { inner: doc }), Ok(Document {
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), field_values: named_doc.0,
} })
} }
} }
@ -132,3 +131,14 @@ impl TopDocs {
Ok(()) Ok(())
} }
} }
#[pyproto]
impl PyObjectProtocol for Searcher {
    // Python `repr()`: reports the number of documents and the number of
    // segment readers of the wrapped searcher.
    fn __repr__(&self) -> PyResult<String> {
        let num_docs = self.inner.num_docs();
        let num_segments = self.inner.segment_readers().len();
        let rendered = format!(
            "Searcher(num_docs={}, num_segments={})",
            num_docs, num_segments
        );
        Ok(rendered)
    }
}

View File

@ -1,36 +1,36 @@
import json
import tantivy import tantivy
import pytest import pytest
from tantivy import Document, Index, SchemaBuilder, Schema
def schema():
    """Build the schema shared by the tests: a stored `title` text field and a `body` text field."""
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
@pytest.fixture(scope="class") @pytest.fixture(scope="class")
def ram_index(): def ram_index():
# assume all tests will use the same documents for now # assume all tests will use the same documents for now
# other methods may set up function-local indexes # other methods may set up function-local indexes
builder = tantivy.SchemaBuilder() index = Index(schema())
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")
schema = builder.build()
index = tantivy.Index(schema)
writer = index.writer() writer = index.writer()
# 2 ways of adding documents # 2 ways of adding documents
# 1 # 1
doc = tantivy.Document() doc = Document()
# create a document instance # create a document instance
# add field-value pairs # add field-value pairs
doc.add_text(title, "The Old Man and the Sea") doc.add_text("title", "The Old Man and the Sea")
doc.add_text(body, ("He was an old man who fished alone in a skiff in" doc.add_text("body", ("He was an old man who fished alone in a skiff in"
"the Gulf Stream and he had gone eighty-four days " "the Gulf Stream and he had gone eighty-four days "
"now without taking a fish.")) "now without taking a fish."))
writer.add_document(doc) writer.add_document(doc)
# 2 use the built-in json support # 2 use the built-in json support
# keys need to coincide with field names # keys need to coincide with field names
doc = schema.parse_document(json.dumps({ doc = Document.from_dict({
"title": "Of Mice and Men", "title": "Of Mice and Men",
"body": ("A few miles south of Soledad, the Salinas River drops " "body": ("A few miles south of Soledad, the Salinas River drops "
"in close to the hillside bank and runs deep and " "in close to the hillside bank and runs deep and "
@ -44,149 +44,145 @@ def ram_index():
"junctures the debris of the winters flooding; and " "junctures the debris of the winters flooding; and "
"sycamores with mottled, white, recumbent limbs and " "sycamores with mottled, white, recumbent limbs and "
"branches that arch over the pool") "branches that arch over the pool")
})) })
writer.add_document(doc) writer.add_document(doc)
writer.add_json("""{
doc = schema.parse_document(json.dumps({
"title": ["Frankenstein", "The Modern Prometheus"], "title": ["Frankenstein", "The Modern Prometheus"],
"body": ("You will rejoice to hear that no disaster has " "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
"accompanied the commencement of an enterprise which you " }""")
"have regarded with such evil forebodings. I arrived "
"here yesterday, and my first task is to assure my dear "
"sister of my welfare and increasing confidence in the "
"success of my undertaking.")
}))
writer.add_document(doc)
writer.commit() writer.commit()
index.reload()
reader = index.reader() return index
searcher = reader.searcher()
index = index
schema = schema
default_args = [title, body]
ret = (index, searcher, schema, default_args, title, body)
return ret
class TestClass(object): class TestClass(object):
def test_simple_search(self, ram_index): def test_simple_search(self, ram_index):
index, searcher, schema, default_args, title, body = ram_index index = ram_index
query_parser = tantivy.QueryParser.for_index(index, default_args) query = index.parse_query("sea whale", ["title", "body"])
query = query_parser.parse_query("sea whale")
top_docs = tantivy.TopDocs(10) top_docs = tantivy.TopDocs(10)
result = searcher.search(query, top_docs) result = index.searcher().search(query, top_docs)
print(result)
assert len(result) == 1 assert len(result) == 1
_, doc_address = result[0] _, doc_address = result[0]
searched_doc = index.searcher().doc(doc_address)
searched_doc = searcher.doc(doc_address) assert searched_doc["title"] == ["The Old Man and the Sea"]
assert searched_doc.get_first(title) == "The Old Man and the Sea"
def test_doc(self):
builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
doc = tantivy.Document()
assert doc.is_empty
doc.add_text(title, "The Old Man and the Sea")
assert doc.get_first(title) == "The Old Man and the Sea"
assert doc.len == 1
assert not doc.is_empty
def test_and_query(self, ram_index): def test_and_query(self, ram_index):
index, searcher, schema, default_args, title, body = ram_index index = ram_index
q_parser = tantivy.QueryParser.for_index(index, default_args) query = index.parse_query("title:men AND body:summer", default_field_names=["title", "body"])
# look for an intersection of documents # look for an intersection of documents
query = q_parser.parse_query("title:men AND body:summer")
top_docs = tantivy.TopDocs(10) top_docs = tantivy.TopDocs(10)
searcher = index.searcher()
result = searcher.search(query, top_docs) result = searcher.search(query, top_docs)
print(result)
# summer isn't present # summer isn't present
assert len(result) == 0 assert len(result) == 0
query = q_parser.parse_query("title:men AND body:winter") query = index.parse_query("title:men AND body:winter", ["title", "body"])
result = searcher.search(query, top_docs) result = searcher.search(query, top_docs)
assert len(result) == 1 assert len(result) == 1
def test_and_query_parser_default_fields(self, ram_index):
    # With `title` as the only default field, a bare term is expected to
    # parse into a single term query.
    expected = """Query(TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114])))"""
    parsed = ram_index.parse_query("winter", default_field_names=["title"])
    assert repr(parsed) == expected
def test_and_query_parser_default_fields_undefined(self, ram_index):
    # Without explicit default fields, the bare term is expected to be
    # searched in both fields of the test schema.
    expected = (
        "Query(BooleanQuery { subqueries: ["
        "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), "
        "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] "
        "})"
    )
    parsed = ram_index.parse_query("winter")
    assert repr(parsed) == expected
def test_query_errors(self, ram_index): def test_query_errors(self, ram_index):
index, searcher, schema, default_args, title, body = ram_index index = ram_index
q_parser = tantivy.QueryParser.for_index(index, default_args)
# no "bod" field # no "bod" field
with pytest.raises(ValueError): with pytest.raises(ValueError):
q_parser.parse_query("bod:title") index.parse_query("bod:men", ["title", "body"])
@pytest.fixture(scope="class") PATH_TO_INDEX = "tests/test_index/"
def disk_index():
builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")
default_args = [title, body]
schema = builder.build()
schema = schema
index = tantivy.Index(schema)
path_to_index = "tests/test_index/"
return index, path_to_index, schema, default_args, title, body
class TestFromDiskClass(object): class TestFromDiskClass(object):
def test_exists(self, disk_index): def test_exists(self):
# prefer to keep it separate in case anyone deletes this # prefer to keep it separate in case anyone deletes this
# runs from the root directory # runs from the root directory
index, path_to_index, _, _, _, _ = disk_index assert Index.exists(PATH_TO_INDEX)
assert index.exists(path_to_index)
def test_opens_from_dir(self, disk_index): def test_opens_from_dir(self):
_, path_to_index, schema, _, _, _ = disk_index index = Index(schema(), PATH_TO_INDEX, reuse=True)
tantivy.Index(schema, path_to_index) assert index.searcher().num_docs == 3
def test_create_readers(self, disk_index): def test_create_readers(self):
_, path_to_index, schema, _, _, _ = disk_index # not sure what is the point of this test.
idx = tantivy.Index(schema, path_to_index) idx = Index(schema())
reload_policy = "OnCommit" # or "Manual" assert idx.searcher().num_docs == 0
assert idx.reader(reload_policy, 4) # by default this is manual mode
assert idx.reader("Manual", 4) writer = idx.writer(30000000, 1)
writer.add_document(Document(title="mytitle", body="mybody"))
writer.commit()
assert idx.searcher().num_docs == 0
# Manual is the default setting.
# In this case, changes are reflected only when
# the index is manually reloaded.
idx.reload()
assert idx.searcher().num_docs == 1
idx.config_reader("OnCommit", 4)
writer.add_document(Document(title="mytitle2", body="mybody2"))
writer.commit()
import time
for i in range(50):
# The index should be automatically reloaded.
# Wait for at most 5s for it to happen.
time.sleep(0.1)
if idx.searcher().num_docs == 2:
return
assert False
def test_create_writer_and_reader(self, disk_index):
_, path_to_index, schema, default_args, title, body = disk_index
idx = tantivy.Index(schema, path_to_index)
writer = idx.writer()
reload_policy = "OnCommit" # or "Manual"
reader = idx.reader(reload_policy, 4)
# check against the opstamp in the meta file class TestSearcher(object):
meta_fname = "meta.json" def test_searcher_repr(self, ram_index):
with open("{}{}".format(path_to_index, meta_fname)) as f: assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)"
json_file = json.load(f)
expected_last_opstamp = json_file["opstamp"]
# ASSUMPTION
# We haven't had any deletes in the index
# so max_doc per index coincides with the value of `num_docs`
# summing them in all segments, gives the number of documents
expected_num_docs = sum([segment["max_doc"]
for segment in json_file["segments"]])
assert writer.commit_opstamp == expected_last_opstamp
q_parser = tantivy.QueryParser.for_index(idx, default_args)
# get all documents
query = q_parser.parse_query("*")
top_docs = tantivy.TopDocs(10)
docs = reader.searcher().search(query, top_docs) class TestDocument(object):
for (_score, doc_addr) in docs:
print(reader.searcher().doc(doc_addr)) def test_document(self):
assert expected_num_docs == len(docs) doc = tantivy.Document(name="Bill", reference=[1, 2])
assert doc["reference"] == [1, 2]
assert doc["name"] == ["Bill"]
assert doc.get_first("name") == "Bill"
assert doc.get_first("reference") == 1
assert doc.to_dict() == {"name": ["Bill"], "reference": [1, 2]}
def test_document_with_date(self):
    # A datetime passed as a field value should be readable back unchanged.
    from datetime import datetime
    moment = datetime(2019, 8, 12, 13, 0, 0)
    doc = tantivy.Document(name="Bill", date=moment)
    stored_dates = doc["date"]
    assert stored_dates[0] == moment
def test_document_repr(self):
    # Pins the exact repr() format of a document with two fields.
    expected = "Document(name=[Bill],reference=[1,2])"
    doc = tantivy.Document(name="Bill", reference=[1, 2])
    assert repr(doc) == expected
def test_document_with_facet(self):
    # Plain two-level facet added through add_facet.
    simple_doc = tantivy.Document()
    simple_facet = tantivy.Facet.from_string("/europe/france")
    simple_doc.add_facet("facet", simple_facet)
    assert simple_doc["facet"][0].to_path() == ['europe', 'france']

    # Facet whose first segment contains an escaped slash.
    escaped_doc = tantivy.Document()
    escaped_facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
    escaped_doc.add_facet("facet", escaped_facet)
    assert escaped_doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
    assert escaped_doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
    assert repr(escaped_doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"

    # The same facet given as a constructor keyword argument.
    kwarg_doc = tantivy.Document(facet=escaped_facet)
    assert kwarg_doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
def test_document_error(self):
    # Constructing a document with a dict as a field value must raise ValueError.
    invalid_value = {}
    with pytest.raises(ValueError):
        tantivy.Document(name=invalid_value)