From 0498f941b0f9af0fdc8c863c2c9110a206a10184 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 2 Aug 2019 20:23:10 +0900 Subject: [PATCH 1/5] API simplification. See #9 --- Cargo.toml | 1 + README.md | 50 +++---- src/document.rs | 295 +++++++++++++++++++++++++++++++++++------- src/facet.rs | 21 ++- src/field.rs | 73 ----------- src/index.rs | 244 ++++++++++++++++++---------------- src/lib.rs | 14 +- src/query.rs | 64 +-------- src/schema.rs | 57 +------- src/schemabuilder.rs | 82 ++++++------ src/searcher.rs | 30 +++-- tantivy/__init__.py | 2 +- tests/tantivy_test.py | 230 ++++++++++++++++---------------- 13 files changed, 615 insertions(+), 548 deletions(-) delete mode 100644 src/field.rs diff --git a/Cargo.toml b/Cargo.toml index f61f2f7..209807e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ crate-type = ["cdylib"] [dependencies] chrono = "0.4" tantivy = { git = "https://github.com/tantivy-search/tantivy" } +itertools = "0.8" [dependencies.pyo3] version = "0.7.0" diff --git a/README.md b/README.md index b05c741..18e3b8e 100644 --- a/README.md +++ b/README.md @@ -24,35 +24,37 @@ needs to be built. After that documents can be added to the index and a reader can be created to search the index. ```python - builder = tantivy.SchemaBuilder() +import tantivy - title = builder.add_text_field("title", stored=True) - body = builder.add_text_field("body") +# Declaring our schema. +schema_builder = tantivy.SchemaBuilder() +schema_builder.add_text_field("title", stored=True) +schema_builder.add_text_field("body", stored=True) +schema = schema_builder.build() - schema = builder.build() - index = tantivy.Index(schema) +# Creating our index (in memory, but filesystem is available too) +index = tantivy.Index(schema) - writer = index.writer() - doc = tantivy.Document() - doc.add_text(title, "The Old Man and the Sea") - doc.add_text(body, ("He was an old man who fished alone in a skiff in" - "the Gulf Stream and he had gone eighty-four days " - "now without taking a fish.")) - writer.add_document(doc) - writer.commit() - - reader = index.reader() - searcher = reader.searcher() +# Adding one document. +writer = index.writer() +writer.add_document({ + "title": "The Old Man and the Sea", + "body": """He was an old man who fished alone in a skiff in + the Gulf Stream and he had gone eighty-four days + now without taking a fish.""" +}) +# ... and committing +writer.commit() - query_parser = tantivy.QueryParser.for_index(index, [title, body]) - query = query_parser.parse_query("sea whale") - top_docs = tantivy.TopDocs(10) - result = searcher.search(query, top_docs) +# Reload the index to ensure it points to the last commit. +index.reload(); +searcher = index.searcher() +query = index.parse_query("sea whale", ["title", "body"]) +top_docs = tantivy.TopDocs(3) - _, doc_address = result[0] - - searched_doc = searcher.doc(doc_address) - assert searched_doc.get_first(title) == "The Old Man and the Sea" +(best_score, best_doc_address) = searcher.search(query, nhits=3)[0] +best_doc = searcher.doc(best_doc_address) +assert best_doc["title"] == ["The Old Man and the Sea"] ``` diff --git a/src/document.rs b/src/document.rs index af8115c..8264cdb 100644 --- a/src/document.rs +++ b/src/document.rs @@ -1,40 +1,220 @@ #![allow(clippy::new_ret_no_self)] +#![allow(clippy::wrong_self_convention)] +use itertools::Itertools; use pyo3::prelude::*; -use pyo3::types::PyDateTime; +use pyo3::types::{PyAny, PyDateTime, PyDict, PyList, PyTuple}; use pyo3::types::{PyDateAccess, PyTimeAccess}; use chrono::offset::TimeZone; -use chrono::Utc; +use chrono::{Datelike, Timelike, Utc}; use tantivy as tv; use crate::facet::Facet; -use crate::field::{Field, FieldValue}; +use crate::to_pyerr; +use pyo3::{PyMappingProtocol, PyObjectProtocol}; +use std::collections::BTreeMap; +use std::fmt; +use tantivy::schema::Value; + +fn value_to_py(py: Python, value: &Value) -> PyResult { + Ok(match value { + Value::Str(text) => text.into_object(py), + Value::U64(num) => num.into_object(py), + Value::I64(num) => num.into_object(py), + Value::F64(num) => num.into_object(py), + Value::Bytes(b) => b.to_object(py), + Value::Date(d) => PyDateTime::new( + py, + d.year(), + d.month() as u8, + d.day() as u8, + d.hour() as u8, + d.minute() as u8, + d.second() as u8, + d.timestamp_subsec_micros(), + None, + )? + .into_object(py), + Value::Facet(f) => Facet { inner: f.clone() }.into_object(py), + }) +} + +fn value_to_string(value: &Value) -> String { + match value { + Value::Str(text) => text.clone(), + Value::U64(num) => format!("{}", num), + Value::I64(num) => format!("{}", num), + Value::F64(num) => format!("{}", num), + Value::Bytes(bytes) => format!("{:?}", bytes), + Value::Date(d) => format!("{:?}", d), + Value::Facet(facet) => facet.to_string(), + } +} /// Tantivy's Document is the object that can be indexed and then searched for. /// /// Documents are fundamentally a collection of unordered tuples -/// (field, value). In this list, one field may appear more than once. +/// (field_name, value). In this list, one field may appear more than once. /// /// Example: /// >>> doc = tantivy.Document() -/// >>> doc.add_text(title, "The Old Man and the Sea") -/// >>> doc.add_text(body, ("He was an old man who fished alone in a " +/// >>> doc.add_text("title", "The Old Man and the Sea") +/// >>> doc.add_text("body", ("He was an old man who fished alone in a " /// "skiff in the Gulf Stream and he had gone " /// "eighty-four days now without taking a fish.")) +/// +/// For simplicity, it is also possible to build a `Document` by passing the field +/// values directly as constructor arguments. +/// +/// Example: +/// >>> doc = tantivy.Document(title=["The Old Man and the Sea"], body=["..."]) +/// +/// As syntactic sugar, tantivy also allows the user to pass a single values +/// if there is only one. In other words, the following is also legal. +/// +/// Example: +/// >>> doc = tantivy.Document(title="The Old Man and the Sea", body="...") + #[pyclass] +#[derive(Default)] pub(crate) struct Document { - pub(crate) inner: tv::Document, + pub(crate) field_values: BTreeMap>, +} + +impl fmt::Debug for Document { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let doc_str = self + .field_values + .iter() + .map(|(field_name, field_values)| { + let mut values_str = + field_values.iter().map(value_to_string).join(","); + values_str.truncate(10); + format!("{}=[{}]", field_name, values_str) + }) + .join(","); + write!(f, "Document({})", doc_str) + } +} + +fn add_value(doc: &mut Document, field_name: String, value: T) +where + Value: From, +{ + doc.field_values + .entry(field_name) + .or_insert_with(Vec::new) + .push(Value::from(value)); +} + +fn extract_value(any: &PyAny) -> PyResult { + if let Ok(s) = any.extract::() { + return Ok(Value::Str(s)); + } + if let Ok(num) = any.extract::() { + return Ok(Value::I64(num)); + } + if let Ok(num) = any.extract::() { + return Ok(Value::F64(num)); + } + if let Ok(py_datetime) = any.downcast_ref::() { + let datetime = Utc + .ymd( + py_datetime.get_year(), + py_datetime.get_month().into(), + py_datetime.get_day().into(), + ) + .and_hms_micro( + py_datetime.get_hour().into(), + py_datetime.get_minute().into(), + py_datetime.get_second().into(), + py_datetime.get_microsecond(), + ); + return Ok(Value::Date(datetime)); + } + if let Ok(facet) = any.downcast_ref::() { + return Ok(Value::Facet(facet.inner.clone())); + } + Err(to_pyerr(format!("Value unsupported {:?}", any))) +} + +fn extract_value_single_or_list(any: &PyAny) -> PyResult> { + if let Ok(values) = any.downcast_ref::() { + values.iter().map(extract_value).collect() + } else { + Ok(vec![extract_value(any)?]) + } } #[pymethods] impl Document { #[new] - fn new(obj: &PyRawObject) { - obj.init(Document { - inner: tv::Document::default(), - }); + #[args(kwargs = "**")] + fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> { + let mut document = Document::default(); + if let Some(field_dict) = kwargs { + document.extend(field_dict)?; + } + obj.init(document); + Ok(()) + } + + fn extend(&mut self, py_dict: &PyDict) -> PyResult<()> { + let mut field_values: BTreeMap> = + BTreeMap::new(); + for key_value_any in py_dict.items() { + if let Ok(key_value) = key_value_any.downcast_ref::() { + if key_value.len() != 2 { + continue; + } + let key: String = key_value.get_item(0).extract()?; + let value_list = + extract_value_single_or_list(key_value.get_item(1))?; + field_values.insert(key, value_list); + } + } + self.field_values.extend(field_values.into_iter()); + Ok(()) + } + + #[staticmethod] + fn from_dict(py_dict: &PyDict) -> PyResult { + let mut field_values: BTreeMap> = + BTreeMap::new(); + for key_value_any in py_dict.items() { + if let Ok(key_value) = key_value_any.downcast_ref::() { + if key_value.len() != 2 { + continue; + } + let key: String = key_value.get_item(0).extract()?; + let value_list = + extract_value_single_or_list(key_value.get_item(1))?; + field_values.insert(key, value_list); + } + } + Ok(Document { field_values }) + } + + /// Returns a dictionary with the different + /// field values. + /// + /// In tantivy, `Document` can be hold multiple + /// values for a single field. + /// + /// For this reason, the dictionary, will associate + /// a list of value for every field. + fn to_dict(&self, py: Python) -> PyResult { + let dict = PyDict::new(py); + for (key, values) in &self.field_values { + let values_py: Vec = values + .iter() + .map(|v| value_to_py(py, v)) + .collect::>()?; + dict.set_item(key, values_py)?; + } + Ok(dict.into()) } /// Add a text value to the document. @@ -42,8 +222,8 @@ impl Document { /// Args: /// field (Field): The field for which we are adding the text. /// text (str): The text that will be added to the document. - fn add_text(&mut self, field: &Field, text: &str) { - self.inner.add_text(field.inner, text); + fn add_text(&mut self, field_name: String, text: &str) { + add_value(self, field_name, text); } /// Add an unsigned integer value to the document. @@ -51,8 +231,8 @@ impl Document { /// Args: /// field (Field): The field for which we are adding the integer. /// value (int): The integer that will be added to the document. - fn add_unsigned(&mut self, field: &Field, value: u64) { - self.inner.add_u64(field.inner, value); + fn add_unsigned(&mut self, field_name: String, value: u64) { + add_value(self, field_name, value); } /// Add a signed integer value to the document. @@ -60,8 +240,8 @@ impl Document { /// Args: /// field (Field): The field for which we are adding the integer. /// value (int): The integer that will be added to the document. - fn add_integer(&mut self, field: &Field, value: i64) { - self.inner.add_i64(field.inner, value); + fn add_integer(&mut self, field_name: String, value: i64) { + add_value(self, field_name, value); } /// Add a date value to the document. @@ -69,7 +249,7 @@ impl Document { /// Args: /// field (Field): The field for which we are adding the integer. /// value (datetime): The date that will be added to the document. - fn add_date(&mut self, field: &Field, value: &PyDateTime) { + fn add_date(&mut self, field_name: String, value: &PyDateTime) { let datetime = Utc .ymd( value.get_year(), @@ -82,16 +262,15 @@ impl Document { value.get_second().into(), value.get_microsecond(), ); - - self.inner.add_date(field.inner, &datetime); + add_value(self, field_name, datetime); } /// Add a facet value to the document. /// Args: /// field (Field): The field for which we are adding the facet. /// value (Facet): The Facet that will be added to the document. - fn add_facet(&mut self, field: &Field, value: &Facet) { - self.inner.add_facet(field.inner, value.inner.clone()); + fn add_facet(&mut self, field_name: String, facet: &Facet) { + add_value(self, field_name, facet.inner.clone()); } /// Add a bytes value to the document. @@ -99,20 +278,20 @@ impl Document { /// Args: /// field (Field): The field for which we are adding the bytes. /// value (bytes): The bytes that will be added to the document. - fn add_bytes(&mut self, field: &Field, value: Vec) { - self.inner.add_bytes(field.inner, value); + fn add_bytes(&mut self, field_name: String, bytes: Vec) { + add_value(self, field_name, bytes); } /// Returns the number of added fields that have been added to the document #[getter] - fn len(&self) -> usize { - self.inner.len() + fn num_fields(&self) -> usize { + self.field_values.len() } /// True if the document is empty, False otherwise. #[getter] fn is_empty(&self) -> bool { - self.inner.is_empty() + self.field_values.is_empty() } /// Get the first value associated with the given field. @@ -122,9 +301,17 @@ impl Document { /// /// Returns the value if one is found, otherwise None. /// The type of the value depends on the field. - fn get_first(&self, py: Python, field: &Field) -> Option { - let value = self.inner.get_first(field.inner)?; - FieldValue::value_to_py(py, value) + fn get_first( + &self, + py: Python, + fieldname: &str, + ) -> PyResult> { + if let Some(value) = self.iter_values_for_field(fieldname).next() { + let py_value = value_to_py(py, value)?; + Ok(Some(py_value)) + } else { + Ok(None) + } } /// Get the all values associated with the given field. @@ -134,21 +321,37 @@ impl Document { /// /// Returns a list of values. /// The type of the value depends on the field. - fn get_all(&self, py: Python, field: &Field) -> Vec { - let values = self.inner.get_all(field.inner); - values - .iter() - .map(|&v| FieldValue::value_to_py(py, v)) - .filter_map(|x| x) - .collect() - } - - /// Get all the fields and values contained in the document. - fn field_values(&self, py: Python) -> Vec { - let field_values = self.inner.field_values(); - field_values - .iter() - .map(|v| FieldValue::field_value_to_py(py, v)) - .collect() + fn get_all(&self, py: Python, field_name: &str) -> PyResult> { + self.iter_values_for_field(field_name) + .map(|value| value_to_py(py, value)) + .collect::>>() + } +} + +impl Document { + fn iter_values_for_field<'a>( + &'a self, + field: &str, + ) -> impl Iterator + 'a { + self.field_values + .get(field) + .into_iter() + .flat_map(|values| values.iter()) + } +} + +#[pyproto] +impl PyMappingProtocol for Document { + fn __getitem__(&self, field_name: &str) -> PyResult> { + let gil = Python::acquire_gil(); + let py = gil.python(); + self.get_all(py, field_name) + } +} + +#[pyproto] +impl PyObjectProtocol for Document { + fn __repr__(&self) -> PyResult { + Ok(format!("{:?}", self)) } } diff --git a/src/facet.rs b/src/facet.rs index de7e930..1686a7c 100644 --- a/src/facet.rs +++ b/src/facet.rs @@ -1,6 +1,6 @@ +use pyo3::basic::PyObjectProtocol; use pyo3::prelude::*; use pyo3::types::PyType; - use tantivy::schema; /// A Facet represent a point in a given hierarchy. @@ -52,4 +52,23 @@ impl Facet { inner: schema::Facet::from_text(facet_string), } } + + /// Returns the list of `segments` that forms a facet path. + /// + /// For instance `//europe/france` becomes `["europe", "france"]`. + fn to_path(&self) -> Vec<&str> { + self.inner.to_path() + } + + /// Returns the facet string representation. + fn to_path_str(&self) -> String { + self.inner.to_string() + } +} + +#[pyproto] +impl PyObjectProtocol for Facet { + fn __repr__(&self) -> PyResult { + Ok(format!("Facet({})", self.to_path_str())) + } } diff --git a/src/field.rs b/src/field.rs deleted file mode 100644 index af46f53..0000000 --- a/src/field.rs +++ /dev/null @@ -1,73 +0,0 @@ -use pyo3::prelude::*; -use pyo3::types::PyDateTime; - -use tantivy::schema; - -use crate::facet::Facet; - -/// Field is a numeric indentifier that represents an entry in the Schema. -#[pyclass] -#[derive(Clone)] -pub(crate) struct Field { - pub(crate) inner: schema::Field, -} - -/// FieldValue holds together a Field and its Value. -#[pyclass] -pub(crate) struct FieldValue { - pub(crate) field: Field, - pub(crate) value: PyObject, -} - -#[pymethods] -impl FieldValue { - #[getter] - fn field(&self) -> Field { - self.field.clone() - } - - #[getter] - fn value(&self) -> &PyObject { - &self.value - } -} - -impl FieldValue { - pub(crate) fn value_to_py( - py: Python, - value: &schema::Value, - ) -> Option { - match value { - schema::Value::Str(text) => Some(text.into_object(py)), - schema::Value::U64(num) => Some(num.into_object(py)), - schema::Value::I64(num) => Some(num.into_object(py)), - schema::Value::F64(num) => Some(num.into_object(py)), - schema::Value::Bytes(b) => Some(b.to_object(py)), - schema::Value::Date(d) => { - let date = - PyDateTime::from_timestamp(py, d.timestamp() as f64, None); - - match date { - Ok(d) => Some(d.into_object(py)), - Err(_e) => None, - } - } - schema::Value::Facet(f) => { - Some(Facet { inner: f.clone() }.into_object(py)) - } - } - } - - pub(crate) fn field_value_to_py( - py: Python, - field_value: &schema::FieldValue, - ) -> FieldValue { - let value = field_value.value(); - let field = field_value.field(); - - FieldValue { - field: Field { inner: field }, - value: FieldValue::value_to_py(py, value).unwrap(), - } - } -} diff --git a/src/index.rs b/src/index.rs index b3fa7b2..65f88ff 100644 --- a/src/index.rs +++ b/src/index.rs @@ -4,60 +4,24 @@ use pyo3::exceptions; use pyo3::prelude::*; use crate::document::Document; +use crate::query::Query; use crate::schema::Schema; use crate::searcher::Searcher; +use crate::to_pyerr; use tantivy as tv; use tantivy::directory::MmapDirectory; +use tantivy::schema::{Field, NamedFieldDocument}; const RELOAD_POLICY: &str = "commit"; -/// IndexReader is the entry point to read and search the index. -/// -/// IndexReader controls when a new version of the index should be loaded and -/// lends you instances of Searcher for the last loaded version. -/// -/// To create an IndexReader first create an Index and call the reader() method -/// on the index object. -#[pyclass] -pub(crate) struct IndexReader { - inner: tv::IndexReader, -} - -#[pymethods] -impl IndexReader { - /// Update searchers so that they reflect the state of the last .commit(). - /// - /// If you set up the the reload policy to be on 'commit' (which is the - /// default) every commit should be rapidly reflected on your IndexReader - /// and you should not need to call reload() at all. - fn reload(&self) -> PyResult<()> { - let ret = self.inner.reload(); - match ret { - Ok(_) => Ok(()), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } - } - - /// Get a Searcher for the index. - /// - /// This method should be called every single time a search query is - /// performed. The searchers are taken from a pool of num_searchers - /// searchers. - /// - /// Returns a Searcher object, if no searcher is available this may block. - fn searcher(&self) -> Searcher { - let searcher = self.inner.searcher(); - Searcher { inner: searcher } - } -} - /// IndexWriter is the user entry-point to add documents to the index. /// /// To create an IndexWriter first create an Index and call the writer() method /// on the index object. #[pyclass] pub(crate) struct IndexWriter { - inner: tv::IndexWriter, + inner_index_writer: tv::IndexWriter, + schema: tv::schema::Schema, } #[pymethods] @@ -70,9 +34,24 @@ impl IndexWriter { /// by the client to align commits with its own document queue. /// The `opstamp` represents the number of documents that have been added /// since the creation of the index. - fn add_document(&mut self, document: &Document) -> PyResult<()> { - self.inner.add_document(document.inner.clone()); - Ok(()) + pub fn add_document(&mut self, doc: &Document) -> PyResult { + let named_doc = NamedFieldDocument(doc.field_values.clone()); + let doc = self.schema.convert_named_doc(named_doc).map_err(to_pyerr)?; + Ok(self.inner_index_writer.add_document(doc)) + } + + /// Helper for the `add_document` method, but passing a json string. + /// + /// If the indexing pipeline is full, this call may block. + /// + /// Returns an `opstamp`, which is an increasing integer that can be used + /// by the client to align commits with its own document queue. + /// The `opstamp` represents the number of documents that have been added + /// since the creation of the index. + pub fn add_json(&mut self, json: &str) -> PyResult { + let doc = self.schema.parse_document(json).map_err(to_pyerr)?; + let opstamp = self.inner_index_writer.add_document(doc); + Ok(opstamp) } /// Commits all of the pending changes @@ -84,12 +63,8 @@ impl IndexWriter { /// spared), it will be possible to resume indexing from this point. /// /// Returns the `opstamp` of the last document that made it in the commit. - fn commit(&mut self) -> PyResult<()> { - let ret = self.inner.commit(); - match ret { - Ok(_) => Ok(()), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } + fn commit(&mut self) -> PyResult { + self.inner_index_writer.commit().map_err(to_pyerr) } /// Rollback to the last commit @@ -97,23 +72,15 @@ impl IndexWriter { /// This cancels all of the update that happened before after the last /// commit. After calling rollback, the index is in the same state as it /// was after the last commit. - fn rollback(&mut self) -> PyResult<()> { - let ret = self.inner.rollback(); - - match ret { - Ok(_) => Ok(()), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } + fn rollback(&mut self) -> PyResult { + self.inner_index_writer.rollback().map_err(to_pyerr) } /// Detect and removes the files that are not used by the index anymore. fn garbage_collect_files(&mut self) -> PyResult<()> { - let ret = self.inner.garbage_collect_files(); - - match ret { - Ok(_) => Ok(()), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } + self.inner_index_writer + .garbage_collect_files() + .map_err(to_pyerr) } /// The opstamp of the last successful commit. @@ -125,7 +92,7 @@ impl IndexWriter { /// for searchers. #[getter] fn commit_opstamp(&self) -> u64 { - self.inner.commit_opstamp() + self.inner_index_writer.commit_opstamp() } } @@ -142,11 +109,19 @@ impl IndexWriter { /// if there was a problem during the opening or creation of the index. #[pyclass] pub(crate) struct Index { - pub(crate) inner: tv::Index, + pub(crate) index: tv::Index, + reader: tv::IndexReader, } #[pymethods] impl Index { + #[staticmethod] + fn open(path: &str) -> PyResult { + let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?; + let reader = index.reader().map_err(to_pyerr)?; + Ok(Index { index, reader }) + } + #[new] #[args(reuse = true)] fn new( @@ -157,32 +132,20 @@ impl Index { ) -> PyResult<()> { let index = match path { Some(p) => { - let directory = MmapDirectory::open(p); - - let dir = match directory { - Ok(d) => d, - Err(e) => { - return Err(exceptions::OSError::py_err(e.to_string())) - } - }; - - let i = if reuse { - tv::Index::open_or_create(dir, schema.inner.clone()) + let directory = MmapDirectory::open(p).map_err(to_pyerr)?; + if reuse { + tv::Index::open_or_create(directory, schema.inner.clone()) } else { - tv::Index::create(dir, schema.inner.clone()) - }; - - match i { - Ok(index) => index, - Err(e) => { - return Err(exceptions::OSError::py_err(e.to_string())) - } + tv::Index::create(directory, schema.inner.clone()) } + .map_err(to_pyerr)? } None => tv::Index::create_in_ram(schema.inner.clone()), }; - obj.init(Index { inner: index }); + let reader = index.reader().map_err(to_pyerr)?; + println!("reader {}", reader.searcher().segment_readers().len()); + obj.init(Index { index, reader }); Ok(()) } @@ -206,32 +169,30 @@ impl Index { num_threads: usize, ) -> PyResult { let writer = match num_threads { - 0 => self.inner.writer(heap_size), - _ => self.inner.writer_with_num_threads(num_threads, heap_size), - }; - - match writer { - Ok(w) => Ok(IndexWriter { inner: w }), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + 0 => self.index.writer(heap_size), + _ => self.index.writer_with_num_threads(num_threads, heap_size), } + .map_err(to_pyerr)?; + let schema = self.index.schema(); + Ok(IndexWriter { + inner_index_writer: writer, + schema, + }) } - /// Create an IndexReader for the index. + /// Configure the index reader. /// /// Args: /// reload_policy (str, optional): The reload policy that the - /// IndexReader should use. Can be manual or OnCommit. + /// IndexReader should use. Can be `Manual` or `OnCommit`. /// num_searchers (int, optional): The number of searchers that the /// reader should create. - /// - /// Returns the IndexReader on success, raises ValueError if a IndexReader - /// couldn't be created. #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)] - fn reader( - &self, + fn config_reader( + &mut self, reload_policy: &str, num_searchers: usize, - ) -> PyResult { + ) -> Result<(), PyErr> { let reload_policy = reload_policy.to_lowercase(); let reload_policy = match reload_policy.as_ref() { "commit" => tv::ReloadPolicy::OnCommit, @@ -242,9 +203,7 @@ impl Index { "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'" )) }; - - let builder = self.inner.reader_builder(); - + let builder = self.index.reader_builder(); let builder = builder.reload_policy(reload_policy); let builder = if num_searchers > 0 { builder.num_searchers(num_searchers) @@ -252,10 +211,13 @@ impl Index { builder }; - let reader = builder.try_into(); - match reader { - Ok(r) => Ok(IndexReader { inner: r }), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + self.reader = builder.try_into().map_err(to_pyerr)?; + Ok(()) + } + + fn searcher(&self) -> Searcher { + Searcher { + inner: self.reader.searcher(), } } @@ -268,19 +230,71 @@ impl Index { /// Raises OSError if the directory cannot be opened. #[staticmethod] fn exists(path: &str) -> PyResult { - let directory = MmapDirectory::open(path); - let dir = match directory { - Ok(d) => d, - Err(e) => return Err(exceptions::OSError::py_err(e.to_string())), - }; - - Ok(tv::Index::exists(&dir)) + let directory = MmapDirectory::open(path).map_err(to_pyerr)?; + Ok(tv::Index::exists(&directory)) } /// The schema of the current index. #[getter] fn schema(&self) -> Schema { - let schema = self.inner.schema(); + let schema = self.index.schema(); Schema { inner: schema } } + + /// Update searchers so that they reflect the state of the last .commit(). + /// + /// If you set up the the reload policy to be on 'commit' (which is the + /// default) every commit should be rapidly reflected on your IndexReader + /// and you should not need to call reload() at all. + fn reload(&self) -> PyResult<()> { + self.reader.reload().map_err(to_pyerr) + } + + /// Parse a query + /// + /// Args: + /// query: the query, following the tantivy query language. + /// default_fields (List[Field]): A list of fields used to search if no + /// field is specified in the query. + /// + #[args(reload_policy = "RELOAD_POLICY")] + pub fn parse_query( + &self, + query: &str, + default_field_names: Option>, + ) -> PyResult { + let mut default_fields = vec![]; + let schema = self.index.schema(); + if let Some(default_field_names_vec) = default_field_names { + for default_field_name in &default_field_names_vec { + if let Some(field) = schema.get_field(default_field_name) { + let field_entry = schema.get_field_entry(field); + if !field_entry.is_indexed() { + return Err(exceptions::ValueError::py_err(format!( + "Field `{}` is not set as indexed in the schema.", + default_field_name + ))); + } + default_fields.push(field); + } else { + return Err(exceptions::ValueError::py_err(format!( + "Field `{}` is not defined in the schema.", + default_field_name + ))); + } + } + } else { + for (field_id, field_entry) in + self.index.schema().fields().iter().enumerate() + { + if field_entry.is_indexed() { + default_fields.push(Field(field_id as u32)); + } + } + } + let parser = + tv::query::QueryParser::for_index(&self.index, default_fields); + let query = parser.parse_query(query).map_err(to_pyerr)?; + Ok(Query { inner: query }) + } } diff --git a/src/lib.rs b/src/lib.rs index 697c40b..5cb0826 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,8 @@ +use pyo3::exceptions; use pyo3::prelude::*; mod document; mod facet; -mod field; mod index; mod query; mod schema; @@ -11,9 +11,7 @@ mod searcher; use document::Document; use facet::Facet; -use field::{Field, FieldValue}; use index::Index; -use query::QueryParser; use schema::Schema; use schemabuilder::SchemaBuilder; use searcher::{DocAddress, Searcher, TopDocs}; @@ -75,14 +73,14 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; - Ok(()) } + +pub(crate) fn to_pyerr(err: E) -> PyErr { + exceptions::ValueError::py_err(err.to_string()) +} diff --git a/src/query.rs b/src/query.rs index fc00734..027c453 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,70 +1,16 @@ -use pyo3::exceptions; use pyo3::prelude::*; -use pyo3::types::PyType; - +use pyo3::PyObjectProtocol; use tantivy as tv; -use crate::field::Field; -use crate::index::Index; - /// Tantivy's Query #[pyclass] pub(crate) struct Query { pub(crate) inner: Box, } -/// Tantivy's Query parser -#[pyclass] -pub(crate) struct QueryParser { - inner: tv::query::QueryParser, -} - -#[pymethods] -impl QueryParser { - /// Creates a QueryParser for an Index. - /// - /// Args: - /// index (Index): The index for which the query will be created. - /// default_fields (List[Field]): A list of fields used to search if no - /// field is specified in the query. - /// - /// Returns the QueryParser. - #[classmethod] - fn for_index( - _cls: &PyType, - index: &Index, - default_fields: Vec<&Field>, - ) -> PyResult { - let default_fields: Vec = - default_fields.iter().map(|&f| f.inner).collect(); - - let parser = - tv::query::QueryParser::for_index(&index.inner, default_fields); - Ok(QueryParser { inner: parser }) - } - - /// Parse a string into a query that can be given to a searcher. - /// - /// Args: - /// query (str): A query string that should be parsed into a query. - /// - /// Returns the parsed Query object. Raises ValueError if there was an - /// error with the query string. - fn parse_query(&self, query: &str) -> PyResult { - let ret = self.inner.parse_query(query); - - match ret { - Ok(q) => Ok(Query { inner: q }), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } - } - - /// Set the default way to compose queries to a conjunction. - /// - /// By default, the query happy tax payer is equivalent to the query happy - /// OR tax OR payer. After calling .set_conjunction_by_default() happy tax - /// payer will be interpreted by the parser as happy AND tax AND payer. - fn set_conjunction_by_default(&mut self) { - self.inner.set_conjunction_by_default(); +#[pyproto] +impl PyObjectProtocol for Query { + fn __repr__(&self) -> PyResult { + Ok(format!("Query({:?})", self.inner)) } } diff --git a/src/schema.rs b/src/schema.rs index 5ced6cf..00d0c53 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -1,10 +1,5 @@ -use pyo3::exceptions; use pyo3::prelude::*; - -use tantivy::schema; - -use crate::document::Document; -use crate::field::Field; +use tantivy as tv; /// Tantivy schema. /// @@ -12,54 +7,8 @@ use crate::field::Field; /// provided. #[pyclass] pub(crate) struct Schema { - pub(crate) inner: schema::Schema, + pub(crate) inner: tv::schema::Schema, } #[pymethods] -impl Schema { - /// Build a document object from a json string. - /// - /// Args: - /// doc_json (str) - A string containing json that should be parsed - /// into a `Document` - /// - /// Returns the parsed document, raises a ValueError if the parsing failed. - fn parse_document(&self, doc_json: &str) -> PyResult { - let ret = self.inner.parse_document(doc_json); - match ret { - Ok(d) => Ok(Document { inner: d }), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } - } - - /// Convert a `Document` object into a json string. - /// - /// Args: - /// doc (Document): The document that will be converted into a json - /// string. - fn to_json(&self, doc: &Document) -> String { - self.inner.to_json(&doc.inner) - } - - /// Return the field name for a given `Field`. - /// - /// Args: - /// field (Field): The field for which the name will be returned. - fn get_field_name(&self, field: &Field) -> &str { - self.inner.get_field_name(field.inner) - } - - /// Returns the field option associated with a given name. - /// - /// Args: - /// name (str): The name of the field that we want to retrieve. - /// - /// Returns the Field if one is found, None otherwise. - fn get_field(&self, name: &str) -> Option { - let f = self.inner.get_field(name); - match f { - Some(field) => Some(Field { inner: field }), - None => None, - } - } -} +impl Schema {} diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs index e44f7de..95233c7 100644 --- a/src/schemabuilder.rs +++ b/src/schemabuilder.rs @@ -5,8 +5,8 @@ use pyo3::prelude::*; use tantivy::schema; -use crate::field::Field; use crate::schema::Schema; +use std::sync::{Arc, RwLock}; /// Tantivy has a very strict schema. /// You need to specify in advance whether a field is indexed or not, @@ -24,8 +24,9 @@ use crate::schema::Schema; /// /// >>> schema = builder.build() #[pyclass] +#[derive(Clone)] pub(crate) struct SchemaBuilder { - pub(crate) builder: Option, + pub(crate) builder: Arc>>, } const TOKENIZER: &str = "default"; @@ -36,7 +37,7 @@ impl SchemaBuilder { #[new] fn new(obj: &PyRawObject) { obj.init(SchemaBuilder { - builder: Some(schema::Schema::builder()), + builder: Arc::new(From::from(Some(schema::Schema::builder()))), }); } @@ -70,9 +71,8 @@ impl SchemaBuilder { stored: bool, tokenizer_name: &str, index_option: &str, - ) -> PyResult { + ) -> PyResult { let builder = &mut self.builder; - let index_option = match index_option { "position" => schema::IndexRecordOption::WithFreqsAndPositions, "freq" => schema::IndexRecordOption::WithFreqs, @@ -94,14 +94,14 @@ impl SchemaBuilder { options }; - if let Some(builder) = builder { - let field = builder.add_text_field(name, options); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_text_field(name, options); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Add a new signed integer field to the schema. @@ -131,19 +131,19 @@ impl SchemaBuilder { stored: bool, indexed: bool, fast: Option<&str>, - ) -> PyResult { + ) -> PyResult { let builder = &mut self.builder; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; - if let Some(builder) = builder { - let field = builder.add_i64_field(name, opts); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_i64_field(name, opts); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Add a new unsigned integer field to the schema. @@ -173,19 +173,19 @@ impl SchemaBuilder { stored: bool, indexed: bool, fast: Option<&str>, - ) -> PyResult { + ) -> PyResult { let builder = &mut self.builder; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; - if let Some(builder) = builder { - let field = builder.add_u64_field(name, opts); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_u64_field(name, opts); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Add a new date field to the schema. @@ -215,35 +215,35 @@ impl SchemaBuilder { stored: bool, indexed: bool, fast: Option<&str>, - ) -> PyResult { + ) -> PyResult { let builder = &mut self.builder; let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?; - if let Some(builder) = builder { - let field = builder.add_date_field(name, opts); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_date_field(name, opts); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Add a Facet field to the schema. /// Args: /// name (str): The name of the field. - fn add_facet_field(&mut self, name: &str) -> PyResult { + fn add_facet_field(&mut self, name: &str) -> PyResult { let builder = &mut self.builder; - if let Some(builder) = builder { - let field = builder.add_facet_field(name); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_facet_field(name); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Add a fast bytes field to the schema. @@ -254,17 +254,17 @@ impl SchemaBuilder { /// /// Args: /// name (str): The name of the field. - fn add_bytes_field(&mut self, name: &str) -> PyResult { + fn add_bytes_field(&mut self, name: &str) -> PyResult { let builder = &mut self.builder; - if let Some(builder) = builder { - let field = builder.add_bytes_field(name); - Ok(Field { inner: field }) + if let Some(builder) = builder.write().unwrap().as_mut() { + builder.add_bytes_field(name); } else { - Err(exceptions::ValueError::py_err( + return Err(exceptions::ValueError::py_err( "Schema builder object isn't valid anymore.", - )) + )); } + Ok(self.clone()) } /// Finalize the creation of a Schema. @@ -272,7 +272,7 @@ impl SchemaBuilder { /// Returns a Schema object. After this is called the SchemaBuilder cannot /// be used anymore. fn build(&mut self) -> PyResult { - let builder = self.builder.take(); + let builder = self.builder.write().unwrap().take(); if let Some(builder) = builder { let schema = builder.build(); Ok(Schema { inner: schema }) diff --git a/src/searcher.rs b/src/searcher.rs index 6d6ec0c..b336ab7 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -1,12 +1,11 @@ #![allow(clippy::new_ret_no_self)] -use pyo3::exceptions; -use pyo3::prelude::*; - -use tantivy as tv; - use crate::document::Document; use crate::query::Query; +use crate::to_pyerr; +use pyo3::prelude::*; +use pyo3::{exceptions, PyObjectProtocol}; +use tantivy as tv; /// Tantivy's Searcher class /// @@ -60,11 +59,11 @@ impl Searcher { /// /// Returns the Document, raises ValueError if the document can't be found. fn doc(&self, doc_address: &DocAddress) -> PyResult { - let ret = self.inner.doc(doc_address.into()); - match ret { - Ok(doc) => Ok(Document { inner: doc }), - Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), - } + let doc = self.inner.doc(doc_address.into()).map_err(to_pyerr)?; + let named_doc = self.inner.schema().to_named_doc(&doc); + Ok(Document { + field_values: named_doc.0, + }) } } @@ -132,3 +131,14 @@ impl TopDocs { Ok(()) } } + +#[pyproto] +impl PyObjectProtocol for Searcher { + fn __repr__(&self) -> PyResult { + Ok(format!( + "Searcher(num_docs={}, num_segments={})", + self.inner.num_docs(), + self.inner.segment_readers().len() + )) + } +} diff --git a/tantivy/__init__.py b/tantivy/__init__.py index b462ae2..c7c118e 100644 --- a/tantivy/__init__.py +++ b/tantivy/__init__.py @@ -1 +1 @@ -from .tantivy import * +from .tantivy import * \ No newline at end of file diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 6da61aa..e9a1b35 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -1,36 +1,34 @@ -import json import tantivy - import pytest +from tantivy import Document, Index, SchemaBuilder, Schema + +def schema(): + return SchemaBuilder()\ + .add_text_field("title", stored=True)\ + .add_text_field("body")\ + .build() @pytest.fixture(scope="class") def ram_index(): # assume all tests will use the same documents for now # other methods may set up function-local indexes - builder = tantivy.SchemaBuilder() - - title = builder.add_text_field("title", stored=True) - body = builder.add_text_field("body") - - schema = builder.build() - index = tantivy.Index(schema) - + index = Index(schema()) writer = index.writer() # 2 ways of adding documents # 1 - doc = tantivy.Document() + doc = Document() # create a document instance # add field-value pairs - doc.add_text(title, "The Old Man and the Sea") - doc.add_text(body, ("He was an old man who fished alone in a skiff in" + doc.add_text("title", "The Old Man and the Sea") + doc.add_text("body", ("He was an old man who fished alone in a skiff in" "the Gulf Stream and he had gone eighty-four days " "now without taking a fish.")) writer.add_document(doc) # 2 use the built-in json support # keys need to coincide with field names - doc = schema.parse_document(json.dumps({ + doc = Document.from_dict({ "title": "Of Mice and Men", "body": ("A few miles south of Soledad, the Salinas River drops " "in close to the hillside bank and runs deep and " @@ -44,149 +42,149 @@ def ram_index(): "junctures the debris of the winter’s flooding; and " "sycamores with mottled, white, recumbent limbs and " "branches that arch over the pool") - })) - + }) writer.add_document(doc) - - doc = schema.parse_document(json.dumps({ + + writer.add_json("""{ "title": ["Frankenstein", "The Modern Prometheus"], - "body": ("You will rejoice to hear that no disaster has " - "accompanied the commencement of an enterprise which you " - "have regarded with such evil forebodings. I arrived " - "here yesterday, and my first task is to assure my dear " - "sister of my welfare and increasing confidence in the " - "success of my undertaking.") - })) - - writer.add_document(doc) + "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." + }""") + writer.commit() + index.reload() + return index - reader = index.reader() - searcher = reader.searcher() - index = index - schema = schema - default_args = [title, body] - ret = (index, searcher, schema, default_args, title, body) - return ret class TestClass(object): def test_simple_search(self, ram_index): - index, searcher, schema, default_args, title, body = ram_index - query_parser = tantivy.QueryParser.for_index(index, default_args) - query = query_parser.parse_query("sea whale") + index = ram_index + query = index.parse_query("sea whale", ["title", "body"]) top_docs = tantivy.TopDocs(10) - result = searcher.search(query, top_docs) - print(result) - + result = index.searcher().search(query, top_docs) assert len(result) == 1 - _, doc_address = result[0] - - searched_doc = searcher.doc(doc_address) - assert searched_doc.get_first(title) == "The Old Man and the Sea" - - def test_doc(self): - builder = tantivy.SchemaBuilder() - title = builder.add_text_field("title", stored=True) - - doc = tantivy.Document() - assert doc.is_empty - - doc.add_text(title, "The Old Man and the Sea") - - assert doc.get_first(title) == "The Old Man and the Sea" - - assert doc.len == 1 - assert not doc.is_empty + searched_doc = index.searcher().doc(doc_address) + assert searched_doc["title"] == ["The Old Man and the Sea"] def test_and_query(self, ram_index): - index, searcher, schema, default_args, title, body = ram_index - q_parser = tantivy.QueryParser.for_index(index, default_args) + index = ram_index + query = index.parse_query("title:men AND body:summer", default_field_names=["title", "body"]) # look for an intersection of documents - query = q_parser.parse_query("title:men AND body:summer") top_docs = tantivy.TopDocs(10) - + searcher = index.searcher() result = searcher.search(query, top_docs) - print(result) # summer isn't present assert len(result) == 0 - query = q_parser.parse_query("title:men AND body:winter") + query = index.parse_query("title:men AND body:winter", ["title", "body"]) result = searcher.search(query, top_docs) assert len(result) == 1 + + def test_and_query_parser_default_fields(self, ram_index): + query = ram_index.parse_query("winter", default_field_names=["title"]) + assert repr(query) == """Query(TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114])))""" + + def test_and_query_parser_default_fields_undefined(self, ram_index): + query = ram_index.parse_query("winter") + assert repr(query) == "Query(BooleanQuery { subqueries: [" \ + "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \ + "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \ + "})" + + def test_query_errors(self, ram_index): - index, searcher, schema, default_args, title, body = ram_index - q_parser = tantivy.QueryParser.for_index(index, default_args) + index = ram_index # no "bod" field with pytest.raises(ValueError): - q_parser.parse_query("bod:title") + index.parse_query("bod:men", ["title", "body"]) -@pytest.fixture(scope="class") -def disk_index(): - builder = tantivy.SchemaBuilder() - title = builder.add_text_field("title", stored=True) - body = builder.add_text_field("body") - default_args = [title, body] - schema = builder.build() - schema = schema - index = tantivy.Index(schema) - path_to_index = "tests/test_index/" - return index, path_to_index, schema, default_args, title, body +PATH_TO_INDEX = "tests/test_index/" class TestFromDiskClass(object): - def test_exists(self, disk_index): + def test_exists(self): # prefer to keep it separate in case anyone deletes this # runs from the root directory - index, path_to_index, _, _, _, _ = disk_index - assert index.exists(path_to_index) + assert Index.exists(PATH_TO_INDEX) - def test_opens_from_dir(self, disk_index): - _, path_to_index, schema, _, _, _ = disk_index - tantivy.Index(schema, path_to_index) + def test_opens_from_dir(self): + index = Index(schema(), PATH_TO_INDEX, reuse=True) + assert index.searcher().num_docs == 3 - def test_create_readers(self, disk_index): - _, path_to_index, schema, _, _, _ = disk_index - idx = tantivy.Index(schema, path_to_index) - reload_policy = "OnCommit" # or "Manual" - assert idx.reader(reload_policy, 4) - assert idx.reader("Manual", 4) + def test_create_readers(self): + # not sure what is the point of this test. + idx = Index(schema()) + assert idx.searcher().num_docs == 0 + # by default this is manual mode + writer = idx.writer(30000000, 1) + writer.add_document(Document(title="mytitle",body="mybody")) + writer.commit() + assert idx.searcher().num_docs == 0 + # Manual is the default setting. + # In this case, change are reflected only when + # the index is manually reloaded. + idx.reload() + assert idx.searcher().num_docs == 1 + idx.config_reader("OnCommit", 4) + writer.add_document(Document(title="mytitle2",body="mybody2")) + writer.commit() + import time + for i in range(50): + # The index should be automatically reloaded. + # Wait for at most 5s for it to happen. + time.sleep(0.1) + if idx.searcher().num_docs == 2: + return + assert False - def test_create_writer_and_reader(self, disk_index): - _, path_to_index, schema, default_args, title, body = disk_index - idx = tantivy.Index(schema, path_to_index) - writer = idx.writer() - reload_policy = "OnCommit" # or "Manual" - reader = idx.reader(reload_policy, 4) +class TestSearcher(object): + def test_searcher_repr(self, ram_index): + assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)" - # check against the opstamp in the meta file - meta_fname = "meta.json" - with open("{}{}".format(path_to_index, meta_fname)) as f: - json_file = json.load(f) - expected_last_opstamp = json_file["opstamp"] - # ASSUMPTION - # We haven't had any deletes in the index - # so max_doc per index coincides with the value of `num_docs` - # summing them in all segments, gives the number of documents - expected_num_docs = sum([segment["max_doc"] - for segment in json_file["segments"]]) - assert writer.commit_opstamp == expected_last_opstamp - q_parser = tantivy.QueryParser.for_index(idx, default_args) - # get all documents - query = q_parser.parse_query("*") - top_docs = tantivy.TopDocs(10) +class TestDocument(object): - docs = reader.searcher().search(query, top_docs) - for (_score, doc_addr) in docs: - print(reader.searcher().doc(doc_addr)) - assert expected_num_docs == len(docs) + def test_document(self): + doc = tantivy.Document(name="Bill", reference=[1, 2]) + assert doc["reference"] == [1, 2] + assert doc["name"] == ["Bill"] + assert doc.get_first("name") == "Bill" + assert doc.get_first("reference") == 1 + assert doc.to_dict() == {"name": ["Bill"], "reference": [1, 2]} + + def test_document_with_date(self): + import datetime + date = datetime.datetime(2019, 8, 12, 13, 0, 0, ) + doc = tantivy.Document(name="Bill", date=date) + assert doc["date"][0] == date + + def test_document_repr(self): + doc = tantivy.Document(name="Bill", reference=[1, 2]) + assert repr(doc) == "Document(name=[Bill],reference=[1,2])" + + def test_document_with_facet(self): + doc = tantivy.Document() + facet = tantivy.Facet.from_string("/europe/france") + doc.add_facet("facet", facet) + assert doc["facet"][0].to_path() == ['europe', 'france'] + doc = tantivy.Document() + facet = tantivy.Facet.from_string("/asia\\/oceania/fiji") + doc.add_facet("facet", facet) + assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] + assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji" + assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)" + doc = tantivy.Document(facet=facet) + assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] + + def test_document_error(self): + with pytest.raises(ValueError): + tantivy.Document(name={}) \ No newline at end of file From 1ce4b4d51ee6493898fa3d8b31345640087da1ca Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 21 Aug 2019 07:19:00 +0900 Subject: [PATCH 2/5] Fixed tantivy rev --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 209807e..9875e48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ crate-type = ["cdylib"] [dependencies] chrono = "0.4" -tantivy = { git = "https://github.com/tantivy-search/tantivy" } +tantivy = { git = "https://github.com/tantivy-search/tantivy", rev = "135e0ea" } itertools = "0.8" [dependencies.pyo3] From 6dc2b943b44bc975af74c08bb21c8a66b66eed01 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 29 Aug 2019 09:36:08 +0900 Subject: [PATCH 3/5] CR comment, fixing some obsolete pydoc. --- src/document.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/document.rs b/src/document.rs index 8264cdb..84b6cce 100644 --- a/src/document.rs +++ b/src/document.rs @@ -220,7 +220,7 @@ impl Document { /// Add a text value to the document. /// /// Args: - /// field (Field): The field for which we are adding the text. + /// field_name (str): The field name for which we are adding the text. /// text (str): The text that will be added to the document. fn add_text(&mut self, field_name: String, text: &str) { add_value(self, field_name, text); @@ -229,7 +229,7 @@ impl Document { /// Add an unsigned integer value to the document. /// /// Args: - /// field (Field): The field for which we are adding the integer. + /// field_name (str): The field name for which we are adding the unsigned integer. /// value (int): The integer that will be added to the document. fn add_unsigned(&mut self, field_name: String, value: u64) { add_value(self, field_name, value); @@ -238,7 +238,7 @@ impl Document { /// Add a signed integer value to the document. /// /// Args: - /// field (Field): The field for which we are adding the integer. + /// field_name (str): The field name for which we are adding the integer. /// value (int): The integer that will be added to the document. fn add_integer(&mut self, field_name: String, value: i64) { add_value(self, field_name, value); @@ -247,7 +247,7 @@ impl Document { /// Add a date value to the document. /// /// Args: - /// field (Field): The field for which we are adding the integer. + /// field_name (str): The field name for which we are adding the date. /// value (datetime): The date that will be added to the document. fn add_date(&mut self, field_name: String, value: &PyDateTime) { let datetime = Utc @@ -267,7 +267,7 @@ impl Document { /// Add a facet value to the document. /// Args: - /// field (Field): The field for which we are adding the facet. + /// field_name (str): The field name for which we are adding the facet. /// value (Facet): The Facet that will be added to the document. fn add_facet(&mut self, field_name: String, facet: &Facet) { add_value(self, field_name, facet.inner.clone()); @@ -276,7 +276,7 @@ impl Document { /// Add a bytes value to the document. /// /// Args: - /// field (Field): The field for which we are adding the bytes. + /// field_name (str): The field for which we are adding the bytes. /// value (bytes): The bytes that will be added to the document. fn add_bytes(&mut self, field_name: String, bytes: Vec) { add_value(self, field_name, bytes); From ccd8021ff97f73e9adb1f43bbab8ec132b7389cf Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 29 Aug 2019 09:55:36 +0900 Subject: [PATCH 4/5] CR: Adding trailing lines and removing some trailing spaces. --- src/index.rs | 11 ++++- tantivy/__init__.py | 2 +- tests/tantivy_test.py | 102 +++++++++++++++++++++--------------------- 3 files changed, 61 insertions(+), 54 deletions(-) diff --git a/src/index.rs b/src/index.rs index 65f88ff..f02bdbf 100644 --- a/src/index.rs +++ b/src/index.rs @@ -144,7 +144,6 @@ impl Index { }; let reader = index.reader().map_err(to_pyerr)?; - println!("reader {}", reader.searcher().segment_readers().len()); obj.init(Index { index, reader }); Ok(()) } @@ -215,6 +214,16 @@ impl Index { Ok(()) } + /// Acquires a Searcher from the searcher pool. + /// + /// If no searcher is available during the call, note that + /// this call will block until one is made available. + /// + /// Searcher are automatically released back into the pool when + /// they are dropped. If you observe this function to block forever + /// you probably should configure the Index to have a larger + /// searcher pool, or you are holding references to previous searcher + /// for ever. fn searcher(&self) -> Searcher { Searcher { inner: self.reader.searcher(), diff --git a/tantivy/__init__.py b/tantivy/__init__.py index c7c118e..b462ae2 100644 --- a/tantivy/__init__.py +++ b/tantivy/__init__.py @@ -1 +1 @@ -from .tantivy import * \ No newline at end of file +from .tantivy import * diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index e9a1b35..870f172 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -3,57 +3,56 @@ import pytest from tantivy import Document, Index, SchemaBuilder, Schema + def schema(): - return SchemaBuilder()\ - .add_text_field("title", stored=True)\ - .add_text_field("body")\ + return SchemaBuilder() \ + .add_text_field("title", stored=True) \ + .add_text_field("body") \ .build() + @pytest.fixture(scope="class") def ram_index(): - # assume all tests will use the same documents for now - # other methods may set up function-local indexes - index = Index(schema()) - writer = index.writer() + # assume all tests will use the same documents for now + # other methods may set up function-local indexes + index = Index(schema()) + writer = index.writer() - # 2 ways of adding documents - # 1 - doc = Document() - # create a document instance - # add field-value pairs - doc.add_text("title", "The Old Man and the Sea") - doc.add_text("body", ("He was an old man who fished alone in a skiff in" - "the Gulf Stream and he had gone eighty-four days " - "now without taking a fish.")) - writer.add_document(doc) - # 2 use the built-in json support - # keys need to coincide with field names - doc = Document.from_dict({ - "title": "Of Mice and Men", - "body": ("A few miles south of Soledad, the Salinas River drops " - "in close to the hillside bank and runs deep and " - "green. The water is warm too, for it has slipped " - "twinkling over the yellow sands in the sunlight " - "before reaching the narrow pool. On one side of the " - "river the golden foothill slopes curve up to the " - "strong and rocky Gabilan Mountains, but on the valley " - "side the water is lined with trees—willows fresh and " - "green with every spring, carrying in their lower leaf " - "junctures the debris of the winter’s flooding; and " - "sycamores with mottled, white, recumbent limbs and " - "branches that arch over the pool") - }) - writer.add_document(doc) - - writer.add_json("""{ + # 2 ways of adding documents + # 1 + doc = Document() + # create a document instance + # add field-value pairs + doc.add_text("title", "The Old Man and the Sea") + doc.add_text("body", ("He was an old man who fished alone in a skiff in" + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish.")) + writer.add_document(doc) + # 2 use the built-in json support + # keys need to coincide with field names + doc = Document.from_dict({ + "title": "Of Mice and Men", + "body": ("A few miles south of Soledad, the Salinas River drops " + "in close to the hillside bank and runs deep and " + "green. The water is warm too, for it has slipped " + "twinkling over the yellow sands in the sunlight " + "before reaching the narrow pool. On one side of the " + "river the golden foothill slopes curve up to the " + "strong and rocky Gabilan Mountains, but on the valley " + "side the water is lined with trees—willows fresh and " + "green with every spring, carrying in their lower leaf " + "junctures the debris of the winter’s flooding; and " + "sycamores with mottled, white, recumbent limbs and " + "branches that arch over the pool") + }) + writer.add_document(doc) + writer.add_json("""{ "title": ["Frankenstein", "The Modern Prometheus"], "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." }""") - - writer.commit() - index.reload() - return index - + writer.commit() + index.reload() + return index class TestClass(object): @@ -86,7 +85,6 @@ class TestClass(object): assert len(result) == 1 - def test_and_query_parser_default_fields(self, ram_index): query = ram_index.parse_query("winter", default_field_names=["title"]) assert repr(query) == """Query(TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114])))""" @@ -98,7 +96,6 @@ class TestClass(object): "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \ "})" - def test_query_errors(self, ram_index): index = ram_index # no "bod" field @@ -106,9 +103,9 @@ class TestClass(object): index.parse_query("bod:men", ["title", "body"]) - PATH_TO_INDEX = "tests/test_index/" + class TestFromDiskClass(object): def test_exists(self): @@ -123,29 +120,30 @@ class TestFromDiskClass(object): def test_create_readers(self): # not sure what is the point of this test. idx = Index(schema()) - assert idx.searcher().num_docs == 0 + assert idx.searcher().num_docs == 0 # by default this is manual mode writer = idx.writer(30000000, 1) - writer.add_document(Document(title="mytitle",body="mybody")) + writer.add_document(Document(title="mytitle", body="mybody")) writer.commit() - assert idx.searcher().num_docs == 0 + assert idx.searcher().num_docs == 0 # Manual is the default setting. # In this case, change are reflected only when # the index is manually reloaded. idx.reload() - assert idx.searcher().num_docs == 1 + assert idx.searcher().num_docs == 1 idx.config_reader("OnCommit", 4) - writer.add_document(Document(title="mytitle2",body="mybody2")) + writer.add_document(Document(title="mytitle2", body="mybody2")) writer.commit() import time for i in range(50): # The index should be automatically reloaded. # Wait for at most 5s for it to happen. time.sleep(0.1) - if idx.searcher().num_docs == 2: + if idx.searcher().num_docs == 2: return assert False + class TestSearcher(object): def test_searcher_repr(self, ram_index): assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)" @@ -187,4 +185,4 @@ class TestDocument(object): def test_document_error(self): with pytest.raises(ValueError): - tantivy.Document(name={}) \ No newline at end of file + tantivy.Document(name={}) From c91234cd73c9dcb804fd025b5a002536ef552eac Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 2 Sep 2019 09:39:18 +0900 Subject: [PATCH 5/5] Depending on tantivy@master --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9875e48..a0b686b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ crate-type = ["cdylib"] [dependencies] chrono = "0.4" -tantivy = { git = "https://github.com/tantivy-search/tantivy", rev = "135e0ea" } +tantivy = { git = "https://github.com/tantivy-search/tantivy", branch = "master" } itertools = "0.8" [dependencies.pyo3]