Merge pull request #1 from matrix-org/initial-bindings

Initial python bindings implementation.
Branch: master
Paul Masurel, 2019-08-01 16:05:15 +09:00, committed by GitHub
commit 9158a4fd7d
21 changed files with 1443 additions and 10 deletions

.gitignore (vendored, 15 changes)

@@ -1,10 +1,7 @@
# Generated by Cargo
# will have compiled files and executables
/target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
/target
**/*.rs.bk
Cargo.lock
dist/
__pycache__/
tantivy.so
tantivy.egg-info/

Cargo.toml (new file, 18 lines)

@@ -0,0 +1,18 @@
[package]
name = "tantivy-py"
version = "0.1.0"
authors = ["Damir Jelić <poljar@termina.org.uk>"]
edition = "2018"
license = "MIT"
[lib]
name = "tantivy"
crate-type = ["dylib"]
[dependencies]
chrono = "0.4"
tantivy = { git = "https://github.com/tantivy-search/tantivy" }
[dependencies.pyo3]
version = "0.7.0"
features = ["extension-module"]

LICENSE

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2019 tantivy
Copyright (c) 2019 The Matrix.org Foundation CIC
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

MANIFEST.in (new file, 4 lines)

@@ -0,0 +1,4 @@
include Cargo.toml
include Makefile
include rust-toolchain
recursive-include src *

Makefile (new file, 17 lines)

@@ -0,0 +1,17 @@
source_files := $(wildcard src/*.rs)

all: tantivy/tantivy.so

.PHONY: test format

test: tantivy/tantivy.so
	python3 -m pytest

format:
	rustfmt src/*.rs

tantivy/tantivy.so: target/debug/libtantivy.so
	cp target/debug/libtantivy.so tantivy/tantivy.so

target/debug/libtantivy.so: $(source_files)
	cargo build

README.md (new file, 54 lines)

@@ -0,0 +1,54 @@
tantivy-py
==========
Python bindings for tantivy.
# Installation
The bindings can be installed using setuptools:
python3 setup.py install --user
Note that this requires setuptools-rust to be installed. The bindings use
[PyO3](https://github.com/PyO3/pyo3), which requires Rust nightly and only
supports Python 3.
# Usage
tantivy-py has a similar API to tantivy. To create an index, a schema first
needs to be built. After that, documents can be added to the index and a
reader can be created to search the index.
```python
builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")
schema = builder.build()
index = tantivy.Index(schema)
writer = index.writer()
doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
doc.add_text(body, ("He was an old man who fished alone in a skiff in"
"the Gulf Stream and he had gone eighty-four days "
"now without taking a fish."))
writer.add_document(doc)
reader = index.reader()
searcher = reader.searcher()
query_parser = tantivy.QueryParser.for_index(index, [title, body])
query = query_parser.parse_query("sea whale")
top_docs = tantivy.TopDocs(10)
result = searcher.search(query, top_docs)
_, doc_address = result[0]
searched_doc = searcher.doc(doc_address)
assert searched_doc.get_first(title) == "The Old Man and the Sea"
```

pyproject.toml (new file, 2 lines)

@@ -0,0 +1,2 @@
[build-system]
requires = ["setuptools", "wheel", "setuptools-rust"]

rust-toolchain (new file, 1 line)

@@ -0,0 +1 @@
nightly-2019-05-22

rustfmt.toml (new file, 1 line)

@@ -0,0 +1 @@
max_width = 80

setup.py (new file, 15 lines)

@@ -0,0 +1,15 @@
from setuptools import setup

try:
    from setuptools_rust import Binding, RustExtension
except ImportError:
    print("Please install setuptools-rust package")
    raise SystemExit(1)

setup(
    name="tantivy",
    version="0.9.1",
    rust_extensions=[RustExtension("tantivy.tantivy", binding=Binding.PyO3)],
    packages=["tantivy"],
    zip_safe=False,
)

src/document.rs (new file, 152 lines)

@@ -0,0 +1,152 @@
use pyo3::prelude::*;
use pyo3::types::PyDateTime;
use pyo3::types::{PyDateAccess, PyTimeAccess};
use chrono::offset::TimeZone;
use chrono::Utc;
use tantivy as tv;
use crate::facet::Facet;
use crate::field::{Field, FieldValue};
/// Tantivy's Document is the object that can be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered tuples
/// (field, value). In this list, one field may appear more than once.
///
/// Example:
/// >>> doc = tantivy.Document()
/// >>> doc.add_text(title, "The Old Man and the Sea")
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
/// "skiff in the Gulf Stream and he had gone "
/// "eighty-four days now without taking a fish."))
#[pyclass]
pub(crate) struct Document {
pub(crate) inner: tv::Document,
}
#[pymethods]
impl Document {
#[new]
fn new(obj: &PyRawObject) {
obj.init(Document {
inner: tv::Document::default(),
});
}
/// Add a text value to the document.
///
/// Args:
/// field (Field): The field for which we are adding the text.
/// text (str): The text that will be added to the document.
fn add_text(&mut self, field: &Field, text: &str) {
self.inner.add_text(field.inner, text);
}
/// Add an unsigned integer value to the document.
///
/// Args:
/// field (Field): The field for which we are adding the integer.
/// value (int): The integer that will be added to the document.
fn add_unsigned(&mut self, field: &Field, value: u64) {
self.inner.add_u64(field.inner, value);
}
/// Add a signed integer value to the document.
///
/// Args:
/// field (Field): The field for which we are adding the integer.
/// value (int): The integer that will be added to the document.
fn add_integer(&mut self, field: &Field, value: i64) {
self.inner.add_i64(field.inner, value);
}
/// Add a date value to the document.
///
/// Args:
/// field (Field): The field for which we are adding the integer.
/// value (datetime): The date that will be added to the document.
fn add_date(&mut self, field: &Field, value: &PyDateTime) {
let datetime = Utc
.ymd(
value.get_year().into(),
value.get_month().into(),
value.get_day().into(),
)
.and_hms_micro(
value.get_hour().into(),
value.get_minute().into(),
value.get_second().into(),
value.get_microsecond().into(),
);
self.inner.add_date(field.inner, &datetime);
}
/// Add a facet value to the document.
/// Args:
/// field (Field): The field for which we are adding the facet.
/// value (Facet): The Facet that will be added to the document.
fn add_facet(&mut self, field: &Field, value: &Facet) {
self.inner.add_facet(field.inner, value.inner.clone());
}
/// Add a bytes value to the document.
///
/// Args:
/// field (Field): The field for which we are adding the bytes.
/// value (bytes): The bytes that will be added to the document.
fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
self.inner.add_bytes(field.inner, value);
}
/// Returns the number of fields that have been added to the document.
#[getter]
fn len(&self) -> usize {
self.inner.len()
}
/// True if the document is empty, False otherwise.
#[getter]
fn is_empty(&self) -> bool {
self.inner.is_empty()
}
/// Get the first value associated with the given field.
///
/// Args:
/// field (Field): The field for which we would like to get the value.
///
/// Returns the value if one is found, otherwise None.
/// The type of the value depends on the field.
fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
let value = self.inner.get_first(field.inner)?;
FieldValue::value_to_py(py, value)
}
/// Get all the values associated with the given field.
///
/// Args:
/// field (Field): The field for which we would like to get the values.
///
/// Returns a list of values.
/// The type of the value depends on the field.
fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
let values = self.inner.get_all(field.inner);
values
.iter()
.map(|&v| FieldValue::value_to_py(py, v))
.filter_map(|x| x)
.collect()
}
/// Get all the fields and values contained in the document.
fn field_values(&self, py: Python) -> Vec<FieldValue> {
let field_values = self.inner.field_values();
field_values
.iter()
.map(|v| FieldValue::field_value_to_py(py, v))
.collect()
}
}
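
A minimal Python sketch of how the Document methods above are meant to be used once the module builds as `tantivy`; the field names (`title`, `rating`, `published`) and the values are illustrative, not part of this change:

```python
import datetime

import tantivy

# Field handles come from a SchemaBuilder (defined in src/schemabuilder.rs).
builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
rating = builder.add_unsigned_field("rating", stored=True)
published = builder.add_date_field("published", stored=True)
builder.build()

doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
doc.add_unsigned(rating, 5)
doc.add_date(published, datetime.datetime(1952, 9, 1))

assert not doc.is_empty
assert doc.len == 3
assert doc.get_first(title) == "The Old Man and the Sea"
assert doc.get_all(rating) == [5]
```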

src/facet.rs (new file, 55 lines)

@@ -0,0 +1,55 @@
use pyo3::prelude::*;
use pyo3::types::PyType;
use tantivy::schema;
/// A Facet represents a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath. For instance, an
/// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
///
/// A document can be associated with any number of facets. The hierarchy
/// implicitly implies that a document belonging to a facet also belongs to
/// the ancestors of that facet. In the example above, these would be
/// /electronics/tv_and_video/ and /electronics.
#[pyclass]
pub(crate) struct Facet {
pub(crate) inner: schema::Facet,
}
#[pymethods]
impl Facet {
/// Create a new instance of the "root facet", equivalent to /.
#[classmethod]
fn root(_cls: &PyType) -> Facet {
Facet {
inner: schema::Facet::root(),
}
}
/// Returns true if the facet is the root facet /.
#[getter]
fn is_root(&self) -> bool {
self.inner.is_root()
}
/// Returns true if another Facet is a subfacet of this facet.
/// Args:
/// other (Facet): The Facet that should be checked for being a subfacet
/// of this one.
fn is_prefix_of(&self, other: &Facet) -> bool {
self.inner.is_prefix_of(&other.inner)
}
/// Create a Facet object from a string.
/// Args:
/// facet_string (str): The string that contains a facet.
///
/// Returns the created Facet.
#[classmethod]
fn from_string(_cls: &PyType, facet_string: &str) -> Facet {
Facet {
inner: schema::Facet::from_text(facet_string),
}
}
}
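
A short sketch of the Facet API above from Python; the facet paths mirror the example in the doc comment, and the `category` field name is illustrative:

```python
import tantivy

facet = tantivy.Facet.from_string("/electronics/tv_and_video/led_tv")
parent = tantivy.Facet.from_string("/electronics")

assert tantivy.Facet.root().is_root
assert not facet.is_root
# An ancestor facet is a prefix of all of its descendants.
assert parent.is_prefix_of(facet)

# Facets are attached to documents through a facet field in the schema.
builder = tantivy.SchemaBuilder()
category = builder.add_facet_field("category")
builder.build()

doc = tantivy.Document()
doc.add_facet(category, facet)
```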

src/field.rs (new file, 72 lines)

@@ -0,0 +1,72 @@
use pyo3::prelude::*;
use pyo3::types::PyDateTime;
use tantivy::schema;
use crate::facet::Facet;
/// Field is a numeric identifier that represents an entry in the Schema.
#[pyclass]
#[derive(Clone)]
pub(crate) struct Field {
pub(crate) inner: schema::Field,
}
/// FieldValue holds together a Field and its Value.
#[pyclass]
pub(crate) struct FieldValue {
pub(crate) field: Field,
pub(crate) value: PyObject,
}
#[pymethods]
impl FieldValue {
#[getter]
fn field(&self) -> Field {
self.field.clone()
}
#[getter]
fn value(&self) -> &PyObject {
&self.value
}
}
impl FieldValue {
pub(crate) fn value_to_py(
py: Python,
value: &schema::Value,
) -> Option<PyObject> {
match value {
schema::Value::Str(text) => Some(text.into_object(py)),
schema::Value::U64(num) => Some(num.into_object(py)),
schema::Value::I64(num) => Some(num.into_object(py)),
schema::Value::Bytes(b) => Some(b.to_object(py)),
schema::Value::Date(d) => {
let date =
PyDateTime::from_timestamp(py, d.timestamp() as f64, None);
match date {
Ok(d) => Some(d.into_object(py)),
Err(_e) => None,
}
}
schema::Value::Facet(f) => {
Some(Facet { inner: f.clone() }.into_object(py))
}
}
}
pub(crate) fn field_value_to_py(
py: Python,
field_value: &schema::FieldValue,
) -> FieldValue {
let value = field_value.value();
let field = field_value.field();
FieldValue {
field: Field { inner: field },
value: FieldValue::value_to_py(py, value).unwrap(),
}
}
}
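
Field and FieldValue mostly surface on the Python side when inspecting a document's contents; a hedged sketch of how they can be read back (the `title` field is illustrative):

```python
import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()

doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")

# field_values() yields FieldValue objects pairing a Field with its Python value.
for field_value in doc.field_values():
    name = schema.get_field_name(field_value.field)
    print(name, field_value.value)
```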

src/index.rs (new file, 284 lines)

@@ -0,0 +1,284 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use crate::document::Document;
use crate::schema::Schema;
use crate::searcher::Searcher;
use tantivy as tv;
use tantivy::directory::MmapDirectory;
const RELOAD_POLICY: &str = "commit";
/// IndexReader is the entry point to read and search the index.
///
/// IndexReader controls when a new version of the index should be loaded and
/// lends you instances of Searcher for the last loaded version.
///
/// To create an IndexReader first create an Index and call the reader() method
/// on the index object.
#[pyclass]
pub(crate) struct IndexReader {
inner: tv::IndexReader,
}
#[pymethods]
impl IndexReader {
/// Update searchers so that they reflect the state of the last .commit().
///
/// If you set up the reload policy to be on 'commit' (which is the
/// default) every commit should be rapidly reflected on your IndexReader
/// and you should not need to call reload() at all.
fn reload(&self) -> PyResult<()> {
let ret = self.inner.reload();
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Get a Searcher for the index.
///
/// This method should be called every single time a search query is
/// performed. The searchers are taken from a pool of num_searchers
/// searchers.
///
/// Returns a Searcher object, if no searcher is available this may block.
fn searcher(&self) -> Searcher {
let searcher = self.inner.searcher();
Searcher { inner: searcher }
}
}
/// IndexWriter is the user entry-point to add documents to the index.
///
/// To create an IndexWriter first create an Index and call the writer() method
/// on the index object.
#[pyclass]
pub(crate) struct IndexWriter {
inner: tv::IndexWriter,
}
#[pymethods]
impl IndexWriter {
/// Add a document to the index.
///
/// If the indexing pipeline is full, this call may block.
///
/// Returns an `opstamp`, which is an increasing integer that can be used
/// by the client to align commits with its own document queue.
/// The `opstamp` represents the number of documents that have been added
/// since the creation of the index.
fn add_document(&mut self, document: &Document) -> PyResult<()> {
self.inner.add_document(document.inner.clone());
Ok(())
}
/// Commits all of the pending changes
///
/// A call to commit blocks. After it returns, all of the documents that
/// were added since the last commit are published and persisted.
///
/// In case of a crash or a hardware failure (as long as the hard disk is
/// spared), it will be possible to resume indexing from this point.
///
/// Returns the `opstamp` of the last document that made it in the commit.
fn commit(&mut self) -> PyResult<()> {
let ret = self.inner.commit();
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Rollback to the last commit
///
/// This cancels all of the updates that happened after the last
/// commit. After calling rollback, the index is in the same state as it
/// was after the last commit.
fn rollback(&mut self) -> PyResult<()> {
let ret = self.inner.rollback();
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Detects and removes the files that are no longer used by the index.
fn garbage_collect_files(&mut self) -> PyResult<()> {
let ret = self.inner.garbage_collect_files();
match ret {
Ok(_) => Ok(()),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// The opstamp of the last successful commit.
///
/// This is the opstamp the index will rollback to if there is a failure
/// like a power surge.
///
/// This is also the opstamp of the commit that is currently available
/// for searchers.
#[getter]
fn commit_opstamp(&self) -> u64 {
self.inner.commit_opstamp()
}
}
/// Create a new index object.
///
/// Args:
/// schema (Schema): The schema of the index.
/// path (str, optional): The path where the index should be stored. If
/// no path is provided, the index will be stored in memory.
/// reuse (bool, optional): Should an existing index be opened if one
/// exists, or should a new one always be created. Defaults to True.
///
/// If an index already exists it will be opened and reused. Raises OSError
/// if there was a problem during the opening or creation of the index.
#[pyclass]
pub(crate) struct Index {
pub(crate) inner: tv::Index,
}
#[pymethods]
impl Index {
#[new]
#[args(reuse = true)]
fn new(
obj: &PyRawObject,
schema: &Schema,
path: Option<&str>,
reuse: bool,
) -> PyResult<()> {
let index = match path {
Some(p) => {
let directory = MmapDirectory::open(p);
let dir = match directory {
Ok(d) => d,
Err(e) => {
return Err(exceptions::OSError::py_err(e.to_string()))
}
};
let i = if reuse {
tv::Index::open_or_create(dir, schema.inner.clone())
} else {
tv::Index::create(dir, schema.inner.clone())
};
match i {
Ok(index) => index,
Err(e) => {
return Err(exceptions::OSError::py_err(e.to_string()))
}
}
}
None => tv::Index::create_in_ram(schema.inner.clone()),
};
obj.init(Index { inner: index });
Ok(())
}
/// Create a `IndexWriter` for the index.
///
/// The writer will be multithreaded and the provided heap size will be
/// split between the given number of threads.
///
/// Args:
/// heap_size (int, optional): The total target heap memory usage of
/// the writer; cannot be less than 3000000. Defaults to 3000000.
/// num_threads (int, optional): The number of threads that the writer
/// should use. If this value is 0, tantivy will automatically choose
/// the number of threads. Defaults to 0.
///
/// Raises ValueError if there was an error while creating the writer.
#[args(heap_size = 3000000, num_threads = 0)]
fn writer(
&self,
heap_size: usize,
num_threads: usize,
) -> PyResult<IndexWriter> {
let writer = match num_threads {
0 => self.inner.writer(heap_size),
_ => self.inner.writer_with_num_threads(num_threads, heap_size),
};
match writer {
Ok(w) => Ok(IndexWriter { inner: w }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Create an IndexReader for the index.
///
/// Args:
/// reload_policy (str, optional): The reload policy that the
/// IndexReader should use. Can be 'manual' or 'commit' (the default).
/// num_searchers (int, optional): The number of searchers that the
/// reader should create.
///
/// Returns the IndexReader on success, raises ValueError if a IndexReader
/// couldn't be created.
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
fn reader(
&self,
reload_policy: &str,
num_searchers: usize,
) -> PyResult<IndexReader> {
let reload_policy = reload_policy.to_lowercase();
let reload_policy = match reload_policy.as_ref() {
"commit" => tv::ReloadPolicy::OnCommit,
"on-commit" => tv::ReloadPolicy::OnCommit,
"oncommit" => tv::ReloadPolicy::OnCommit,
"manual" => tv::ReloadPolicy::Manual,
_ => return Err(exceptions::ValueError::py_err(
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
))
};
let builder = self.inner.reader_builder();
let builder = builder.reload_policy(reload_policy);
let builder = if num_searchers > 0 {
builder.num_searchers(num_searchers)
} else {
builder
};
let reader = builder.try_into();
match reader {
Ok(r) => Ok(IndexReader { inner: r }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Check if the given path contains an existing index.
/// Args:
/// path: The path where tantivy will search for an index.
///
/// Returns True if an index exists at the given path, False otherwise.
///
/// Raises OSError if the directory cannot be opened.
#[staticmethod]
fn exists(path: &str) -> PyResult<bool> {
let directory = MmapDirectory::open(path);
let dir = match directory {
Ok(d) => d,
Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
};
Ok(tv::Index::exists(&dir))
}
/// The schema of the current index.
#[getter]
fn schema(&self) -> Schema {
let schema = self.inner.schema();
Schema { inner: schema }
}
}
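
A sketch of the Index, IndexWriter and IndexReader entry points defined above; the on-disk path is a made-up example, and the heap size and searcher count are arbitrary illustrative values:

```python
import os

import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()

# With a path the index is memory-mapped on disk; without one it lives in RAM.
path = "/tmp/tantivy-example"  # hypothetical location
os.makedirs(path, exist_ok=True)
if tantivy.Index.exists(path):
    print("an index already exists and will be reused")
index = tantivy.Index(schema, path, reuse=True)

# heap_size is split across the writer threads; num_threads=0 lets tantivy decide.
writer = index.writer(heap_size=15_000_000, num_threads=0)
doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
writer.add_document(doc)
writer.commit()

# 'commit' (the default) reloads searchers automatically after every commit.
reader = index.reader(reload_policy="commit", num_searchers=1)
print(reader.searcher().num_docs)
```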

src/lib.rs (new file, 88 lines)

@@ -0,0 +1,88 @@
use pyo3::prelude::*;
mod document;
mod facet;
mod field;
mod index;
mod query;
mod schema;
mod schemabuilder;
mod searcher;
use document::Document;
use facet::Facet;
use field::{Field, FieldValue};
use index::Index;
use query::QueryParser;
use schema::Schema;
use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, Searcher, TopDocs};
/// Python bindings for the search engine library Tantivy.
///
/// Tantivy is a full text search engine library written in rust.
///
/// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
/// the sense that it is not an off-the-shelf search engine server, but rather
/// a library that can be used to build such a search engine.
/// Tantivy is, in fact, strongly inspired by Lucene's design.
///
/// Example:
/// >>> import json
/// >>> import tantivy
///
/// >>> builder = tantivy.SchemaBuilder()
///
/// >>> title = builder.add_text_field("title", stored=True)
/// >>> body = builder.add_text_field("body")
///
/// >>> schema = builder.build()
/// >>> index = tantivy.Index(schema)
/// >>> writer = index.writer()
/// >>> doc = tantivy.Document()
/// >>> doc.add_text(title, "The Old Man and the Sea")
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
/// "skiff in the Gulf Stream and he had gone "
/// "eighty-four days now without taking a fish."))
///
/// >>> writer.add_document(doc)
///
/// >>> doc = schema.parse_document(json.dumps({
/// "title": ["Frankenstein", "The Modern Prometheus"],
/// "body": ("You will rejoice to hear that no disaster has "
/// "accompanied the commencement of an enterprise which "
/// "you have regarded with such evil forebodings. "
/// "I arrived here yesterday, and my first task is to "
/// "assure my dear sister of my welfare and increasing "
/// "confidence in the success of my undertaking.")
/// }))
///
/// >>> writer.add_document(doc)
/// >>> writer.commit()
///
/// >>> reader = index.reader()
/// >>> searcher = reader.searcher()
///
/// >>> query_parser = tantivy.QueryParser.for_index(index, [title, body])
/// >>> query = query_parser.parse_query("sea whale")
///
/// >>> top_docs = tantivy.TopDocs(10)
/// >>> result = searcher.search(query, top_docs)
///
/// >>> assert len(result) == 1
///
#[pymodule]
fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Schema>()?;
m.add_class::<SchemaBuilder>()?;
m.add_class::<Searcher>()?;
m.add_class::<Index>()?;
m.add_class::<QueryParser>()?;
m.add_class::<Document>()?;
m.add_class::<DocAddress>()?;
m.add_class::<TopDocs>()?;
m.add_class::<Field>()?;
m.add_class::<FieldValue>()?;
m.add_class::<Facet>()?;
Ok(())
}

src/query.rs (new file, 70 lines)

@@ -0,0 +1,70 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::PyType;
use tantivy as tv;
use crate::field::Field;
use crate::index::Index;
/// Tantivy's Query
#[pyclass]
pub(crate) struct Query {
pub(crate) inner: Box<dyn tv::query::Query>,
}
/// Tantivy's Query parser
#[pyclass]
pub(crate) struct QueryParser {
inner: tv::query::QueryParser,
}
#[pymethods]
impl QueryParser {
/// Creates a QueryParser for an Index.
///
/// Args:
/// index (Index): The index for which the query will be created.
/// default_fields (List[Field]): A list of fields used to search if no
/// field is specified in the query.
///
/// Returns the QueryParser.
#[classmethod]
fn for_index(
_cls: &PyType,
index: &Index,
default_fields: Vec<&Field>,
) -> PyResult<QueryParser> {
let default_fields: Vec<tv::schema::Field> =
default_fields.iter().map(|&f| f.inner.clone()).collect();
let parser =
tv::query::QueryParser::for_index(&index.inner, default_fields);
Ok(QueryParser { inner: parser })
}
/// Parse a string into a query that can be given to a searcher.
///
/// Args:
/// query (str): A query string that should be parsed into a query.
///
/// Returns the parsed Query object. Raises ValueError if there was an
/// error with the query string.
fn parse_query(&self, query: &str) -> PyResult<Query> {
let ret = self.inner.parse_query(query);
match ret {
Ok(q) => Ok(Query { inner: q }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Set the default way to compose queries to a conjunction.
///
/// By default, the query "happy tax payer" is equivalent to the query
/// "happy OR tax OR payer". After calling .set_conjunction_by_default(),
/// "happy tax payer" will be interpreted by the parser as
/// "happy AND tax AND payer".
fn set_conjunction_by_default(&mut self) {
self.inner.set_conjunction_by_default();
}
}
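
A sketch of how QueryParser is used from Python; the schema fields are illustrative:

```python
import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")
schema = builder.build()
index = tantivy.Index(schema)

query_parser = tantivy.QueryParser.for_index(index, [title, body])

# By default the terms are OR-ed together ...
query = query_parser.parse_query("sea whale")

# ... and after this call they are AND-ed instead.
query_parser.set_conjunction_by_default()
strict_query = query_parser.parse_query("sea whale")
```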

src/schema.rs (new file, 65 lines)

@@ -0,0 +1,65 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use tantivy::schema;
use crate::document::Document;
use crate::field::Field;
/// Tantivy schema.
///
/// The schema is very strict. To build the schema the `SchemaBuilder` class is
/// provided.
#[pyclass]
pub(crate) struct Schema {
pub(crate) inner: schema::Schema,
}
#[pymethods]
impl Schema {
/// Build a document object from a json string.
///
/// Args:
/// doc_json (str): A string containing JSON that should be parsed
/// into a `Document`.
///
/// Returns the parsed document, raises a ValueError if the parsing failed.
fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
let ret = self.inner.parse_document(doc_json);
match ret {
Ok(d) => Ok(Document { inner: d }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Convert a `Document` object into a json string.
///
/// Args:
/// doc (Document): The document that will be converted into a json
/// string.
fn to_json(&self, doc: &Document) -> String {
self.inner.to_json(&doc.inner)
}
/// Return the field name for a given `Field`.
///
/// Args:
/// field (Field): The field for which the name will be returned.
fn get_field_name(&self, field: &Field) -> &str {
self.inner.get_field_name(field.inner)
}
/// Return the field associated with a given name.
///
/// Args:
/// name (str): The name of the field that we want to retrieve.
///
/// Returns the Field if one is found, None otherwise.
fn get_field(&self, name: &str) -> Option<Field> {
let f = self.inner.get_field(name);
match f {
Some(field) => Some(Field { inner: field }),
None => None,
}
}
}
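
A sketch of the Schema helpers above; the JSON document is illustrative:

```python
import json

import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()

# parse_document builds a Document from a JSON object mapping field names to values.
doc = schema.parse_document(json.dumps({"title": "Of Mice and Men"}))

# to_json converts a Document back to JSON, and get_field resolves a name to a Field.
print(schema.to_json(doc))
field = schema.get_field("title")
assert schema.get_field_name(field) == "title"
assert schema.get_field("no_such_field") is None
```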

src/schemabuilder.rs (new file, 318 lines)

@@ -0,0 +1,318 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use tantivy::schema;
use crate::field::Field;
use crate::schema::Schema;
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
///
/// Examples:
///
/// >>> builder = tantivy.SchemaBuilder()
///
/// >>> title = builder.add_text_field("title", stored=True)
/// >>> body = builder.add_text_field("body")
///
/// >>> schema = builder.build()
#[pyclass]
pub(crate) struct SchemaBuilder {
pub(crate) builder: Option<schema::SchemaBuilder>,
}
const TOKENIZER: &str = "default";
const RECORD: &str = "position";
#[pymethods]
impl SchemaBuilder {
#[new]
fn new(obj: &PyRawObject) {
obj.init(SchemaBuilder {
builder: Some(schema::Schema::builder()),
});
}
/// Add a new text field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// tokenizer_name (str, optional): The name of the tokenizer that
/// should be used to process the field. Defaults to 'default'
/// index_option (str, optional): Sets which information should be
/// indexed with the tokens. Can be one of 'position', 'freq' or
/// 'basic'. Defaults to 'position'. The 'basic' index_option
/// records only the document ID, the 'freq' option records the
/// document id and the term frequency, while the 'position' option
/// records the document id, term frequency and the positions of
/// the term occurrences in the document.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(
stored = false,
tokenizer_name = "TOKENIZER",
index_option = "RECORD"
)]
fn add_text_field(
&mut self,
name: &str,
stored: bool,
tokenizer_name: &str,
index_option: &str,
) -> PyResult<Field> {
let builder = &mut self.builder;
let index_option = match index_option {
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
"freq" => schema::IndexRecordOption::WithFreqs,
"basic" => schema::IndexRecordOption::Basic,
_ => return Err(exceptions::ValueError::py_err(
"Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
))
};
let indexing = schema::TextFieldIndexing::default()
.set_tokenizer(tokenizer_name)
.set_index_option(index_option);
let options =
schema::TextOptions::default().set_indexing_options(indexing);
let options = if stored {
options.set_stored()
} else {
options
};
if let Some(builder) = builder {
let field = builder.add_text_field(name, options);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Add a new signed integer field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the i64 options as a single-valued fast
/// field. Fast fields are designed for random access. Access times
/// are similar to a random lookup in an array. If more than one
/// value is associated with a fast field, only the last one is kept.
/// Can be one of 'single' or 'multi'. If this is set to 'single',
/// the document must have exactly one value associated with it.
/// If this is set to 'multi', the document can have any number of
/// values. Defaults to None, which disables this option.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
fn add_integer_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: Option<&str>,
) -> PyResult<Field> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder {
let field = builder.add_i64_field(name, opts);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Add a new unsigned integer field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the u64 options as a single-valued fast
/// field. Fast fields are designed for random access. Access times
/// are similar to a random lookup in an array. If more than one
/// value is associated with a fast field, only the last one is kept.
/// Can be one of 'single' or 'multi'. If this is set to 'single',
/// the document must have exactly one value associated with it.
/// If this is set to 'multi', the document can have any number of
/// values. Defaults to None, which disables this option.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
fn add_unsigned_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: Option<&str>,
) -> PyResult<Field> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder {
let field = builder.add_u64_field(name, opts);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Add a new date field to the schema.
///
/// Args:
/// name (str): The name of the field.
/// stored (bool, optional): If true sets the field as stored, the
/// content of the field can be later restored from a Searcher.
/// Defaults to False.
/// indexed (bool, optional): If true sets the field to be indexed.
/// fast (str, optional): Set the date options as a single-valued fast
/// field. Fast fields are designed for random access. Access times
/// are similar to a random lookup in an array. If more than one
/// value is associated with a fast field, only the last one is kept.
/// Can be one of 'single' or 'multi'. If this is set to 'single',
/// the document must have exactly one value associated with it.
/// If this is set to 'multi', the document can have any number of
/// values. Defaults to None, which disables this option.
///
/// Returns the associated field handle.
/// Raises a ValueError if there was an error with the field creation.
#[args(stored = false, indexed = false)]
fn add_date_field(
&mut self,
name: &str,
stored: bool,
indexed: bool,
fast: Option<&str>,
) -> PyResult<Field> {
let builder = &mut self.builder;
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
if let Some(builder) = builder {
let field = builder.add_date_field(name, opts);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Add a Facet field to the schema.
/// Args:
/// name (str): The name of the field.
fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
let builder = &mut self.builder;
if let Some(builder) = builder {
let field = builder.add_facet_field(name);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Add a fast bytes field to the schema.
///
/// Bytes fields are not searchable and are only used
/// as fast fields, to associate any kind of payload
/// with a document.
///
/// Args:
/// name (str): The name of the field.
fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
let builder = &mut self.builder;
if let Some(builder) = builder {
let field = builder.add_bytes_field(name);
Ok(Field { inner: field })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
/// Finalize the creation of a Schema.
///
/// Returns a Schema object. After this is called the SchemaBuilder cannot
/// be used anymore.
fn build(&mut self) -> PyResult<Schema> {
let builder = self.builder.take();
if let Some(builder) = builder {
let schema = builder.build();
Ok(Schema { inner: schema })
} else {
Err(exceptions::ValueError::py_err(
"Schema builder object isn't valid anymore.",
))
}
}
}
impl SchemaBuilder {
fn build_int_option(
stored: bool,
indexed: bool,
fast: Option<&str>,
) -> PyResult<schema::IntOptions> {
let opts = schema::IntOptions::default();
let opts = if stored { opts.set_stored() } else { opts };
let opts = if indexed { opts.set_indexed() } else { opts };
let fast = match fast {
Some(f) => {
let f = f.to_lowercase();
match f.as_ref() {
"single" => Some(schema::Cardinality::SingleValue),
"multi" => Some(schema::Cardinality::MultiValues),
_ => return Err(exceptions::ValueError::py_err(
"Invalid index option, valid choices are: 'multivalue' and 'singlevalue'"
)),
}
}
None => None,
};
let opts = if let Some(f) = fast {
opts.set_fast(f)
} else {
opts
};
Ok(opts)
}
}
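
A sketch pulling together the field types and option strings accepted by the SchemaBuilder above; all field names are illustrative:

```python
import tantivy

builder = tantivy.SchemaBuilder()

# index_option: 'basic' (doc ids), 'freq' (ids and term frequency) or
# 'position' (ids, frequency and positions, the default).
title = builder.add_text_field("title", stored=True, index_option="position")
body = builder.add_text_field("body", tokenizer_name="default", index_option="freq")

# Numeric and date fields take stored/indexed flags plus an optional
# fast-field cardinality of 'single' or 'multi'.
rating = builder.add_unsigned_field("rating", stored=True, indexed=True, fast="single")
year = builder.add_integer_field("year", indexed=True)
published = builder.add_date_field("published", stored=True)

category = builder.add_facet_field("category")
payload = builder.add_bytes_field("payload")

schema = builder.build()
# build() consumes the builder; calling it again raises ValueError.
```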

src/searcher.rs (new file, 134 lines)

@@ -0,0 +1,134 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use tantivy as tv;
use crate::document::Document;
use crate::query::Query;
/// Tantivy's Searcher class
///
/// A Searcher is used to search the index given a prepared Query.
#[pyclass]
pub(crate) struct Searcher {
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
}
#[pymethods]
impl Searcher {
/// Search the index with the given query and collect results.
///
/// Args:
/// query (Query): The query that will be used for the search.
/// collector (Collector): A collector that determines how the search
/// results will be collected. Only the TopDocs collector is
/// supported for now.
///
/// Returns a list of tuples that contains the scores and DocAddress of the
/// search results.
///
/// Raises a ValueError if there was an error with the search.
fn search(
&self,
query: &Query,
collector: &mut TopDocs,
) -> PyResult<Vec<(f32, DocAddress)>> {
let ret = self.inner.search(&query.inner, &collector.inner);
match ret {
Ok(r) => {
let result: Vec<(f32, DocAddress)> = r
.iter()
.map(|(f, d)| (f.clone(), DocAddress::from(d)))
.collect();
Ok(result)
}
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
/// Returns the overall number of documents in the index.
#[getter]
fn num_docs(&self) -> u64 {
self.inner.num_docs()
}
/// Fetches a document from Tantivy's store given a DocAddress.
///
/// Args:
/// doc_address (DocAddress): The DocAddress that is associated with
/// the document that we wish to fetch.
///
/// Returns the Document, raises ValueError if the document can't be found.
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
let ret = self.inner.doc(doc_address.into());
match ret {
Ok(doc) => Ok(Document { inner: doc }),
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
}
}
}
/// DocAddress contains all the necessary information to identify a document
/// given a Searcher object.
///
/// It consists of an id identifying its segment, and its segment-local DocId.
/// The id used for the segment is actually an ordinal in the list of segments
/// held by a Searcher.
#[pyclass]
pub(crate) struct DocAddress {
pub(crate) segment_ord: tv::SegmentLocalId,
pub(crate) doc: tv::DocId,
}
#[pymethods]
impl DocAddress {
/// The segment ordinal is an id identifying the segment hosting the
/// document. It is only meaningful in the context of a searcher.
#[getter]
fn segment_ord(&self) -> u32 {
self.segment_ord
}
/// The segment local DocId
#[getter]
fn doc(&self) -> u32 {
self.doc
}
}
impl From<&tv::DocAddress> for DocAddress {
fn from(doc_address: &tv::DocAddress) -> Self {
DocAddress {
segment_ord: doc_address.segment_ord(),
doc: doc_address.doc(),
}
}
}
impl Into<tv::DocAddress> for &DocAddress {
fn into(self) -> tv::DocAddress {
tv::DocAddress(self.segment_ord(), self.doc())
}
}
/// The Top Score Collector keeps track of the K documents sorted by their
/// score.
///
/// Args:
/// limit (int, optional): The number of documents that the top scorer will
/// retrieve. Must be a positive integer larger than 0. Defaults to 10.
#[pyclass]
pub(crate) struct TopDocs {
inner: tv::collector::TopDocs,
}
#[pymethods]
impl TopDocs {
#[new]
#[args(limit = 10)]
fn new(obj: &PyRawObject, limit: usize) -> PyResult<()> {
let top = tv::collector::TopDocs::with_limit(limit);
obj.init(TopDocs { inner: top });
Ok(())
}
}
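
A sketch of a full search round-trip through the Searcher, TopDocs and DocAddress types above, mirroring the README example:

```python
import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()
index = tantivy.Index(schema)

writer = index.writer()
doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
writer.add_document(doc)
writer.commit()

searcher = index.reader().searcher()
query = tantivy.QueryParser.for_index(index, [title]).parse_query("old man")

top_docs = tantivy.TopDocs(10)  # keep the ten best-scoring hits
for score, doc_address in searcher.search(query, top_docs):
    hit = searcher.doc(doc_address)  # fetch the stored document
    print(score, doc_address.segment_ord, doc_address.doc, hit.get_first(title))
```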

tantivy/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
from .tantivy import *

tests/tantivy_test.py (new file, 85 lines)

@@ -0,0 +1,85 @@
import json
import tantivy


class TestClass(object):
    def test_simple_search(self):
        builder = tantivy.SchemaBuilder()

        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")

        schema = builder.build()
        index = tantivy.Index(schema)

        writer = index.writer()

        doc = tantivy.Document()
        doc.add_text(title, "The Old Man and the Sea")
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)

        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winters flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool")
        }))
        writer.add_document(doc)

        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings. I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking.")
        }))
        writer.add_document(doc)
        writer.commit()

        reader = index.reader()
        searcher = reader.searcher()

        query_parser = tantivy.QueryParser.for_index(index, [title, body])
        query = query_parser.parse_query("sea whale")

        top_docs = tantivy.TopDocs(10)
        result = searcher.search(query, top_docs)
        print(result)

        assert len(result) == 1
        _, doc_address = result[0]
        searched_doc = searcher.doc(doc_address)
        assert searched_doc.get_first(title) == "The Old Man and the Sea"

    def test_doc(self):
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)

        doc = tantivy.Document()
        assert doc.is_empty

        doc.add_text(title, "The Old Man and the Sea")
        assert doc.get_first(title) == "The Old Man and the Sea"
        assert doc.len == 1
        assert not doc.is_empty