Merge pull request #1 from matrix-org/initial-bindings
Initial python bindings implementation.master
commit
9158a4fd7d
|
@ -1,10 +1,7 @@
|
||||||
# Generated by Cargo
|
/target
|
||||||
# will have compiled files and executables
|
|
||||||
/target/
|
|
||||||
|
|
||||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
|
||||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
|
||||||
Cargo.lock
|
|
||||||
|
|
||||||
# These are backup files generated by rustfmt
|
|
||||||
**/*.rs.bk
|
**/*.rs.bk
|
||||||
|
Cargo.lock
|
||||||
|
dist/
|
||||||
|
__pycache__/
|
||||||
|
tantivy.so
|
||||||
|
tantivy.egg-info/
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
[package]
|
||||||
|
name = "tantivy-py"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Damir Jelić <poljar@termina.org.uk>"]
|
||||||
|
edition = "2018"
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[lib]
|
||||||
|
name = "tantivy"
|
||||||
|
# Python extension modules built with PyO3 are C-compatible dynamic libraries.
crate-type = ["cdylib"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
chrono = "0.4"
|
||||||
|
tantivy = { git = "https://github.com/tantivy-search/tantivy" }
|
||||||
|
|
||||||
|
[dependencies.pyo3]
|
||||||
|
version = "0.7.0"
|
||||||
|
features = ["extension-module"]
|
2
LICENSE
2
LICENSE
|
@ -1,6 +1,6 @@
|
||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2019 tantivy
|
Copyright (c) 2019 The Matrix.org Foundation CIC
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
include Cargo.toml
|
||||||
|
include Makefile
|
||||||
|
include rust-toolchain
|
||||||
|
recursive-include src *
|
|
@ -0,0 +1,17 @@
|
||||||
|
source_files := $(wildcard src/*.rs)
|
||||||
|
|
||||||
|
all: tantivy/tantivy.so
|
||||||
|
|
||||||
|
# Targets that do not produce a file of the same name.
.PHONY: all test format
|
||||||
|
|
||||||
|
test: tantivy/tantivy.so
|
||||||
|
python3 -m pytest
|
||||||
|
|
||||||
|
format:
|
||||||
|
rustfmt src/*.rs
|
||||||
|
|
||||||
|
tantivy/tantivy.so: target/debug/libtantivy.so
|
||||||
|
cp target/debug/libtantivy.so tantivy/tantivy.so
|
||||||
|
|
||||||
|
target/debug/libtantivy.so: $(source_files)
|
||||||
|
cargo build
|
|
@ -0,0 +1,54 @@
|
||||||
|
tantivy-py
|
||||||
|
==========
|
||||||
|
|
||||||
|
Python bindings for tantivy.
|
||||||
|
|
||||||
|
|
||||||
|
# Installation
|
||||||
|
|
||||||
|
The bindings can be installed using setuptools:
|
||||||
|
|
||||||
|
python3 setup.py install --user
|
||||||
|
|
||||||
|
Note that this requires setuptools-rust to be installed. Another thing to note
|
||||||
|
is that the bindings are using [PyO3](https://github.com/PyO3/pyo3), which
|
||||||
|
requires rust nightly and only supports python3.
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
tantivy-py has a similar API to tantivy. To create an index, first a schema
|
||||||
|
needs to be built. After that documents can be added to the index and a reader
|
||||||
|
can be created to search the index.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import tantivy

builder = tantivy.SchemaBuilder()
|
||||||
|
|
||||||
|
title = builder.add_text_field("title", stored=True)
|
||||||
|
body = builder.add_text_field("body")
|
||||||
|
|
||||||
|
schema = builder.build()
|
||||||
|
index = tantivy.Index(schema)
|
||||||
|
|
||||||
|
writer = index.writer()
|
||||||
|
|
||||||
|
doc = tantivy.Document()
|
||||||
|
doc.add_text(title, "The Old Man and the Sea")
|
||||||
|
doc.add_text(body, ("He was an old man who fished alone in a skiff in "
|
||||||
|
"the Gulf Stream and he had gone eighty-four days "
|
||||||
|
"now without taking a fish."))
|
||||||
|
writer.add_document(doc)
writer.commit()
|
||||||
|
|
||||||
|
reader = index.reader()
|
||||||
|
searcher = reader.searcher()
|
||||||
|
|
||||||
|
query_parser = tantivy.QueryParser.for_index(index, [title, body])
|
||||||
|
query = query_parser.parse_query("sea whale")
|
||||||
|
|
||||||
|
top_docs = tantivy.TopDocs(10)
|
||||||
|
result = searcher.search(query, top_docs)
|
||||||
|
|
||||||
|
_, doc_address = result[0]
|
||||||
|
|
||||||
|
searched_doc = searcher.doc(doc_address)
|
||||||
|
assert searched_doc.get_first(title) == "The Old Man and the Sea"
|
||||||
|
```
|
|
@ -0,0 +1,2 @@
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools", "wheel", "setuptools-rust"]
|
|
@ -0,0 +1 @@
|
||||||
|
nightly-2019-05-22
|
|
@ -0,0 +1 @@
|
||||||
|
max_width = 80
|
|
@ -0,0 +1,15 @@
|
||||||
|
from setuptools import setup

# setuptools-rust is a hard build requirement: bail out with a hint if it is
# not importable.
try:
    from setuptools_rust import Binding, RustExtension
except ImportError:
    print("Please install setuptools-rust package")
    raise SystemExit(1)

# The native module is installed as `tantivy.tantivy` and built through PyO3.
rust_extension = RustExtension("tantivy.tantivy", binding=Binding.PyO3)

setup(
    name="tantivy",
    version="0.9.1",
    rust_extensions=[rust_extension],
    packages=["tantivy"],
    zip_safe=False,
)
|
|
@ -0,0 +1,152 @@
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3::types::PyDateTime;
|
||||||
|
use pyo3::types::{PyDateAccess, PyTimeAccess};
|
||||||
|
|
||||||
|
use chrono::offset::TimeZone;
|
||||||
|
use chrono::Utc;
|
||||||
|
|
||||||
|
use tantivy as tv;
|
||||||
|
|
||||||
|
use crate::facet::Facet;
|
||||||
|
use crate::field::{Field, FieldValue};
|
||||||
|
|
||||||
|
/// Tantivy's Document is the object that can be indexed and then searched for.
|
||||||
|
///
|
||||||
|
/// Documents are fundamentally a collection of unordered tuples
|
||||||
|
/// (field, value). In this list, one field may appear more than once.
|
||||||
|
///
|
||||||
|
/// Example:
|
||||||
|
/// >>> doc = tantivy.Document()
|
||||||
|
/// >>> doc.add_text(title, "The Old Man and the Sea")
|
||||||
|
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
|
||||||
|
/// "skiff in the Gulf Stream and he had gone "
|
||||||
|
/// "eighty-four days now without taking a fish."))
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Document {
|
||||||
|
pub(crate) inner: tv::Document,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Document {
|
||||||
|
#[new]
|
||||||
|
fn new(obj: &PyRawObject) {
|
||||||
|
obj.init(Document {
|
||||||
|
inner: tv::Document::default(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a text value to the document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the text.
|
||||||
|
/// text (str): The text that will be added to the document.
|
||||||
|
fn add_text(&mut self, field: &Field, text: &str) {
|
||||||
|
self.inner.add_text(field.inner, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add an unsigned integer value to the document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the integer.
|
||||||
|
/// value (int): The integer that will be added to the document.
|
||||||
|
fn add_unsigned(&mut self, field: &Field, value: u64) {
|
||||||
|
self.inner.add_u64(field.inner, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a signed integer value to the document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the integer.
|
||||||
|
/// value (int): The integer that will be added to the document.
|
||||||
|
fn add_integer(&mut self, field: &Field, value: i64) {
|
||||||
|
self.inner.add_i64(field.inner, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a date value to the document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the integer.
|
||||||
|
/// value (datetime): The date that will be added to the document.
|
||||||
|
fn add_date(&mut self, field: &Field, value: &PyDateTime) {
|
||||||
|
let datetime = Utc
|
||||||
|
.ymd(
|
||||||
|
value.get_year().into(),
|
||||||
|
value.get_month().into(),
|
||||||
|
value.get_day().into(),
|
||||||
|
)
|
||||||
|
.and_hms_micro(
|
||||||
|
value.get_hour().into(),
|
||||||
|
value.get_minute().into(),
|
||||||
|
value.get_second().into(),
|
||||||
|
value.get_microsecond().into(),
|
||||||
|
);
|
||||||
|
|
||||||
|
self.inner.add_date(field.inner, &datetime);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a facet value to the document.
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the facet.
|
||||||
|
/// value (Facet): The Facet that will be added to the document.
|
||||||
|
fn add_facet(&mut self, field: &Field, value: &Facet) {
|
||||||
|
self.inner.add_facet(field.inner, value.inner.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a bytes value to the document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we are adding the bytes.
|
||||||
|
/// value (bytes): The bytes that will be added to the document.
|
||||||
|
fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
|
||||||
|
self.inner.add_bytes(field.inner, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of added fields that have been added to the document
|
||||||
|
#[getter]
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.inner.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if the document is empty, False otherwise.
|
||||||
|
#[getter]
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
self.inner.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the first value associated with the given field.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we would like to get the value.
|
||||||
|
///
|
||||||
|
/// Returns the value if one is found, otherwise None.
|
||||||
|
/// The type of the value depends on the field.
|
||||||
|
fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
|
||||||
|
let value = self.inner.get_first(field.inner)?;
|
||||||
|
FieldValue::value_to_py(py, value)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the all values associated with the given field.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which we would like to get the values.
|
||||||
|
///
|
||||||
|
/// Returns a list of values.
|
||||||
|
/// The type of the value depends on the field.
|
||||||
|
fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
|
||||||
|
let values = self.inner.get_all(field.inner);
|
||||||
|
values
|
||||||
|
.iter()
|
||||||
|
.map(|&v| FieldValue::value_to_py(py, v))
|
||||||
|
.filter_map(|x| x)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get all the fields and values contained in the document.
|
||||||
|
fn field_values(&self, py: Python) -> Vec<FieldValue> {
|
||||||
|
let field_values = self.inner.field_values();
|
||||||
|
field_values
|
||||||
|
.iter()
|
||||||
|
.map(|v| FieldValue::field_value_to_py(py, v))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3::types::PyType;
|
||||||
|
|
||||||
|
use tantivy::schema;
|
||||||
|
|
||||||
|
/// A Facet represents a point in a given hierarchy.
|
||||||
|
///
|
||||||
|
/// They are typically represented similarly to a filepath. For instance, an
|
||||||
|
/// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
|
||||||
|
///
|
||||||
|
/// A document can be associated to any number of facets. The hierarchy
|
||||||
|
/// implicitly implies that a document belonging to a facet also belongs to the
|
||||||
|
/// ancestor of its facet. In the example above, /electronics/tv_and_video/
|
||||||
|
/// and /electronics.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Facet {
|
||||||
|
pub(crate) inner: schema::Facet,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Facet {
|
||||||
|
/// Create a new instance of the "root facet" Equivalent to /.
|
||||||
|
#[classmethod]
|
||||||
|
fn root(_cls: &PyType) -> Facet {
|
||||||
|
Facet {
|
||||||
|
inner: schema::Facet::root(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if the facet is the root facet /.
|
||||||
|
#[getter]
|
||||||
|
fn is_root(&self) -> bool {
|
||||||
|
self.inner.is_root()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if another Facet is a subfacet of this facet.
|
||||||
|
/// Args:
|
||||||
|
/// other (Facet): The Facet that we should check if this facet is a
|
||||||
|
/// subset of.
|
||||||
|
fn is_prefix_of(&self, other: &Facet) -> bool {
|
||||||
|
self.inner.is_prefix_of(&other.inner)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a Facet object from a string.
|
||||||
|
/// Args:
|
||||||
|
/// facet_string (str): The string that contains a facet.
|
||||||
|
///
|
||||||
|
/// Returns the created Facet.
|
||||||
|
#[classmethod]
|
||||||
|
fn from_string(_cls: &PyType, facet_string: &str) -> Facet {
|
||||||
|
Facet {
|
||||||
|
inner: schema::Facet::from_text(facet_string),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3::types::PyDateTime;
|
||||||
|
|
||||||
|
use tantivy::schema;
|
||||||
|
|
||||||
|
use crate::facet::Facet;
|
||||||
|
|
||||||
|
/// Field is a numeric indentifier that represents an entry in the Schema.
|
||||||
|
#[pyclass]
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub(crate) struct Field {
|
||||||
|
pub(crate) inner: schema::Field,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// FieldValue holds together a Field and its Value.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct FieldValue {
|
||||||
|
pub(crate) field: Field,
|
||||||
|
pub(crate) value: PyObject,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl FieldValue {
|
||||||
|
#[getter]
|
||||||
|
fn field(&self) -> Field {
|
||||||
|
self.field.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[getter]
|
||||||
|
fn value(&self) -> &PyObject {
|
||||||
|
&self.value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FieldValue {
|
||||||
|
pub(crate) fn value_to_py(
|
||||||
|
py: Python,
|
||||||
|
value: &schema::Value,
|
||||||
|
) -> Option<PyObject> {
|
||||||
|
match value {
|
||||||
|
schema::Value::Str(text) => Some(text.into_object(py)),
|
||||||
|
schema::Value::U64(num) => Some(num.into_object(py)),
|
||||||
|
schema::Value::I64(num) => Some(num.into_object(py)),
|
||||||
|
schema::Value::Bytes(b) => Some(b.to_object(py)),
|
||||||
|
schema::Value::Date(d) => {
|
||||||
|
let date =
|
||||||
|
PyDateTime::from_timestamp(py, d.timestamp() as f64, None);
|
||||||
|
|
||||||
|
match date {
|
||||||
|
Ok(d) => Some(d.into_object(py)),
|
||||||
|
Err(_e) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
schema::Value::Facet(f) => {
|
||||||
|
Some(Facet { inner: f.clone() }.into_object(py))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn field_value_to_py(
|
||||||
|
py: Python,
|
||||||
|
field_value: &schema::FieldValue,
|
||||||
|
) -> FieldValue {
|
||||||
|
let value = field_value.value();
|
||||||
|
let field = field_value.field();
|
||||||
|
|
||||||
|
FieldValue {
|
||||||
|
field: Field { inner: field },
|
||||||
|
value: FieldValue::value_to_py(py, value).unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,284 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
use crate::document::Document;
|
||||||
|
use crate::schema::Schema;
|
||||||
|
use crate::searcher::Searcher;
|
||||||
|
use tantivy as tv;
|
||||||
|
use tantivy::directory::MmapDirectory;
|
||||||
|
|
||||||
|
const RELOAD_POLICY: &str = "commit";
|
||||||
|
|
||||||
|
/// IndexReader is the entry point to read and search the index.
|
||||||
|
///
|
||||||
|
/// IndexReader controls when a new version of the index should be loaded and
|
||||||
|
/// lends you instances of Searcher for the last loaded version.
|
||||||
|
///
|
||||||
|
/// To create an IndexReader first create an Index and call the reader() method
|
||||||
|
/// on the index object.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct IndexReader {
|
||||||
|
inner: tv::IndexReader,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl IndexReader {
|
||||||
|
/// Update searchers so that they reflect the state of the last .commit().
|
||||||
|
///
|
||||||
|
/// If you set up the the reload policy to be on 'commit' (which is the
|
||||||
|
/// default) every commit should be rapidly reflected on your IndexReader
|
||||||
|
/// and you should not need to call reload() at all.
|
||||||
|
fn reload(&self) -> PyResult<()> {
|
||||||
|
let ret = self.inner.reload();
|
||||||
|
match ret {
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a Searcher for the index.
|
||||||
|
///
|
||||||
|
/// This method should be called every single time a search query is
|
||||||
|
/// performed. The searchers are taken from a pool of num_searchers
|
||||||
|
/// searchers.
|
||||||
|
///
|
||||||
|
/// Returns a Searcher object, if no searcher is available this may block.
|
||||||
|
fn searcher(&self) -> Searcher {
|
||||||
|
let searcher = self.inner.searcher();
|
||||||
|
Searcher { inner: searcher }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// IndexWriter is the user entry-point to add documents to the index.
|
||||||
|
///
|
||||||
|
/// To create an IndexWriter first create an Index and call the writer() method
|
||||||
|
/// on the index object.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct IndexWriter {
|
||||||
|
inner: tv::IndexWriter,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl IndexWriter {
|
||||||
|
/// Add a document to the index.
|
||||||
|
///
|
||||||
|
/// If the indexing pipeline is full, this call may block.
|
||||||
|
///
|
||||||
|
/// Returns an `opstamp`, which is an increasing integer that can be used
|
||||||
|
/// by the client to align commits with its own document queue.
|
||||||
|
/// The `opstamp` represents the number of documents that have been added
|
||||||
|
/// since the creation of the index.
|
||||||
|
fn add_document(&mut self, document: &Document) -> PyResult<()> {
|
||||||
|
self.inner.add_document(document.inner.clone());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commits all of the pending changes
|
||||||
|
///
|
||||||
|
/// A call to commit blocks. After it returns, all of the document that
|
||||||
|
/// were added since the last commit are published and persisted.
|
||||||
|
///
|
||||||
|
/// In case of a crash or an hardware failure (as long as the hard disk is
|
||||||
|
/// spared), it will be possible to resume indexing from this point.
|
||||||
|
///
|
||||||
|
/// Returns the `opstamp` of the last document that made it in the commit.
|
||||||
|
fn commit(&mut self) -> PyResult<()> {
|
||||||
|
let ret = self.inner.commit();
|
||||||
|
match ret {
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rollback to the last commit
|
||||||
|
///
|
||||||
|
/// This cancels all of the update that happened before after the last
|
||||||
|
/// commit. After calling rollback, the index is in the same state as it
|
||||||
|
/// was after the last commit.
|
||||||
|
fn rollback(&mut self) -> PyResult<()> {
|
||||||
|
let ret = self.inner.rollback();
|
||||||
|
|
||||||
|
match ret {
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detect and removes the files that are not used by the index anymore.
|
||||||
|
fn garbage_collect_files(&mut self) -> PyResult<()> {
|
||||||
|
let ret = self.inner.garbage_collect_files();
|
||||||
|
|
||||||
|
match ret {
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The opstamp of the last successful commit.
|
||||||
|
///
|
||||||
|
/// This is the opstamp the index will rollback to if there is a failure
|
||||||
|
/// like a power surge.
|
||||||
|
///
|
||||||
|
/// This is also the opstamp of the commit that is currently available
|
||||||
|
/// for searchers.
|
||||||
|
#[getter]
|
||||||
|
fn commit_opstamp(&self) -> u64 {
|
||||||
|
self.inner.commit_opstamp()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new index object.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// schema (Schema): The schema of the index.
|
||||||
|
/// path (str, optional): The path where the index should be stored. If
|
||||||
|
/// no path is provided, the index will be stored in memory.
|
||||||
|
/// reuse (bool, optional): Should we open an existing index if one exists
|
||||||
|
/// or always create a new one.
|
||||||
|
///
|
||||||
|
/// If `reuse` is true and an index already exists at `path`, it will be
/// opened and reused; otherwise a new index is created. Raises OSError
|
||||||
|
/// if there was a problem during the opening or creation of the index.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Index {
|
||||||
|
pub(crate) inner: tv::Index,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Index {
|
||||||
|
#[new]
|
||||||
|
#[args(reuse = true)]
|
||||||
|
fn new(
|
||||||
|
obj: &PyRawObject,
|
||||||
|
schema: &Schema,
|
||||||
|
path: Option<&str>,
|
||||||
|
reuse: bool,
|
||||||
|
) -> PyResult<()> {
|
||||||
|
let index = match path {
|
||||||
|
Some(p) => {
|
||||||
|
let directory = MmapDirectory::open(p);
|
||||||
|
|
||||||
|
let dir = match directory {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(e) => {
|
||||||
|
return Err(exceptions::OSError::py_err(e.to_string()))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let i = if reuse {
|
||||||
|
tv::Index::open_or_create(dir, schema.inner.clone())
|
||||||
|
} else {
|
||||||
|
tv::Index::create(dir, schema.inner.clone())
|
||||||
|
};
|
||||||
|
|
||||||
|
match i {
|
||||||
|
Ok(index) => index,
|
||||||
|
Err(e) => {
|
||||||
|
return Err(exceptions::OSError::py_err(e.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => tv::Index::create_in_ram(schema.inner.clone()),
|
||||||
|
};
|
||||||
|
|
||||||
|
obj.init(Index { inner: index });
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a `IndexWriter` for the index.
|
||||||
|
///
|
||||||
|
/// The writer will be multithreaded and the provided heap size will be
|
||||||
|
/// split between the given number of threads.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// overall_heap_size (int, optional): The total target memory usage of
|
||||||
|
/// the writer, can't be less than 3000000.
|
||||||
|
/// num_threads (int, optional): The number of threads that the writer
|
||||||
|
/// should use. If this value is 0, tantivy will choose
|
||||||
|
/// automatically the number of threads.
|
||||||
|
///
|
||||||
|
/// Raises ValueError if there was an error while creating the writer.
|
||||||
|
#[args(heap_size = 3000000, num_threads = 0)]
|
||||||
|
fn writer(
|
||||||
|
&self,
|
||||||
|
heap_size: usize,
|
||||||
|
num_threads: usize,
|
||||||
|
) -> PyResult<IndexWriter> {
|
||||||
|
let writer = match num_threads {
|
||||||
|
0 => self.inner.writer(heap_size),
|
||||||
|
_ => self.inner.writer_with_num_threads(num_threads, heap_size),
|
||||||
|
};
|
||||||
|
|
||||||
|
match writer {
|
||||||
|
Ok(w) => Ok(IndexWriter { inner: w }),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an IndexReader for the index.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// reload_policy (str, optional): The reload policy that the
|
||||||
|
/// IndexReader should use. Can be manual or OnCommit.
|
||||||
|
/// num_searchers (int, optional): The number of searchers that the
|
||||||
|
/// reader should create.
|
||||||
|
///
|
||||||
|
/// Returns the IndexReader on success, raises ValueError if a IndexReader
|
||||||
|
/// couldn't be created.
|
||||||
|
#[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
|
||||||
|
fn reader(
|
||||||
|
&self,
|
||||||
|
reload_policy: &str,
|
||||||
|
num_searchers: usize,
|
||||||
|
) -> PyResult<IndexReader> {
|
||||||
|
let reload_policy = reload_policy.to_lowercase();
|
||||||
|
let reload_policy = match reload_policy.as_ref() {
|
||||||
|
"commit" => tv::ReloadPolicy::OnCommit,
|
||||||
|
"on-commit" => tv::ReloadPolicy::OnCommit,
|
||||||
|
"oncommit" => tv::ReloadPolicy::OnCommit,
|
||||||
|
"manual" => tv::ReloadPolicy::Manual,
|
||||||
|
_ => return Err(exceptions::ValueError::py_err(
|
||||||
|
"Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
|
||||||
|
))
|
||||||
|
};
|
||||||
|
|
||||||
|
let builder = self.inner.reader_builder();
|
||||||
|
|
||||||
|
let builder = builder.reload_policy(reload_policy);
|
||||||
|
let builder = if num_searchers > 0 {
|
||||||
|
builder.num_searchers(num_searchers)
|
||||||
|
} else {
|
||||||
|
builder
|
||||||
|
};
|
||||||
|
|
||||||
|
let reader = builder.try_into();
|
||||||
|
match reader {
|
||||||
|
Ok(r) => Ok(IndexReader { inner: r }),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if the given path contains an existing index.
|
||||||
|
/// Args:
|
||||||
|
/// path: The path where tantivy will search for an index.
|
||||||
|
///
|
||||||
|
/// Returns True if an index exists at the given path, False otherwise.
|
||||||
|
///
|
||||||
|
/// Raises OSError if the directory cannot be opened.
|
||||||
|
#[staticmethod]
|
||||||
|
fn exists(path: &str) -> PyResult<bool> {
|
||||||
|
let directory = MmapDirectory::open(path);
|
||||||
|
let dir = match directory {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(tv::Index::exists(&dir))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The schema of the current index.
|
||||||
|
#[getter]
|
||||||
|
fn schema(&self) -> Schema {
|
||||||
|
let schema = self.inner.schema();
|
||||||
|
Schema { inner: schema }
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,88 @@
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
mod document;
|
||||||
|
mod facet;
|
||||||
|
mod field;
|
||||||
|
mod index;
|
||||||
|
mod query;
|
||||||
|
mod schema;
|
||||||
|
mod schemabuilder;
|
||||||
|
mod searcher;
|
||||||
|
|
||||||
|
use document::Document;
|
||||||
|
use facet::Facet;
|
||||||
|
use field::{Field, FieldValue};
|
||||||
|
use index::Index;
|
||||||
|
use query::QueryParser;
|
||||||
|
use schema::Schema;
|
||||||
|
use schemabuilder::SchemaBuilder;
|
||||||
|
use searcher::{DocAddress, Searcher, TopDocs};
|
||||||
|
|
||||||
|
/// Python bindings for the search engine library Tantivy.
|
||||||
|
///
|
||||||
|
/// Tantivy is a full text search engine library written in rust.
|
||||||
|
///
|
||||||
|
/// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
|
||||||
|
/// the sense it is not an off-the-shelf search engine server, but rather
|
||||||
|
/// a library that can be used to build such a search engine.
|
||||||
|
/// Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||||
|
///
|
||||||
|
/// Example:
|
||||||
|
/// >>> import json
|
||||||
|
/// >>> import tantivy
|
||||||
|
///
|
||||||
|
/// >>> builder = tantivy.SchemaBuilder()
|
||||||
|
///
|
||||||
|
/// >>> title = builder.add_text_field("title", stored=True)
|
||||||
|
/// >>> body = builder.add_text_field("body")
|
||||||
|
///
|
||||||
|
/// >>> schema = builder.build()
|
||||||
|
/// >>> index = tantivy.Index(schema)
|
||||||
|
/// >>> doc = tantivy.Document()
|
||||||
|
/// >>> doc.add_text(title, "The Old Man and the Sea")
|
||||||
|
/// >>> doc.add_text(body, ("He was an old man who fished alone in a "
|
||||||
|
/// "skiff in the Gulf Stream and he had gone "
|
||||||
|
/// "eighty-four days now without taking a fish."))
|
||||||
|
///
|
||||||
|
/// >>> writer.add_document(doc)
|
||||||
|
///
|
||||||
|
/// >>> doc = schema.parse_document(json.dumps({
|
||||||
|
/// "title": ["Frankenstein", "The Modern Prometheus"],
|
||||||
|
/// "body": ("You will rejoice to hear that no disaster has "
|
||||||
|
/// "accompanied the commencement of an enterprise which "
|
||||||
|
/// "you have regarded with such evil forebodings. "
|
||||||
|
/// "I arrived here yesterday, and my first task is to "
|
||||||
|
/// "assure my dear sister of my welfare and increasing "
|
||||||
|
/// "confidence in the success of my undertaking.")
|
||||||
|
/// }))
|
||||||
|
///
|
||||||
|
/// >>> writer.add_document(doc)
|
||||||
|
/// >>> writer.commit()
|
||||||
|
///
|
||||||
|
/// >>> reader = index.reader()
|
||||||
|
/// >>> searcher = reader.searcher()
|
||||||
|
///
|
||||||
|
/// >>> query_parser = tantivy.QueryParser.for_index(index, [title, body])
|
||||||
|
/// >>> query = query_parser.parse_query("sea whale")
|
||||||
|
///
|
||||||
|
/// >>> top_docs = tantivy.TopDocs.with_limit(10)
|
||||||
|
/// >>> result = searcher.search(query, top_docs)
|
||||||
|
///
|
||||||
|
/// >>> assert len(result) == 1
|
||||||
|
///
|
||||||
|
#[pymodule]
|
||||||
|
fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||||
|
m.add_class::<Schema>()?;
|
||||||
|
m.add_class::<SchemaBuilder>()?;
|
||||||
|
m.add_class::<Searcher>()?;
|
||||||
|
m.add_class::<Index>()?;
|
||||||
|
m.add_class::<QueryParser>()?;
|
||||||
|
m.add_class::<Document>()?;
|
||||||
|
m.add_class::<DocAddress>()?;
|
||||||
|
m.add_class::<TopDocs>()?;
|
||||||
|
m.add_class::<Field>()?;
|
||||||
|
m.add_class::<FieldValue>()?;
|
||||||
|
m.add_class::<Facet>()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3::types::PyType;
|
||||||
|
|
||||||
|
use tantivy as tv;
|
||||||
|
|
||||||
|
use crate::field::Field;
|
||||||
|
use crate::index::Index;
|
||||||
|
|
||||||
|
/// Tantivy's Query
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Query {
|
||||||
|
pub(crate) inner: Box<dyn tv::query::Query>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tantivy's Query parser
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct QueryParser {
|
||||||
|
inner: tv::query::QueryParser,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl QueryParser {
|
||||||
|
/// Creates a QueryParser for an Index.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// index (Index): The index for which the query will be created.
|
||||||
|
/// default_fields (List[Field]): A list of fields used to search if no
|
||||||
|
/// field is specified in the query.
|
||||||
|
///
|
||||||
|
/// Returns the QueryParser.
|
||||||
|
#[classmethod]
|
||||||
|
fn for_index(
|
||||||
|
_cls: &PyType,
|
||||||
|
index: &Index,
|
||||||
|
default_fields: Vec<&Field>,
|
||||||
|
) -> PyResult<QueryParser> {
|
||||||
|
let default_fields: Vec<tv::schema::Field> =
|
||||||
|
default_fields.iter().map(|&f| f.inner.clone()).collect();
|
||||||
|
|
||||||
|
let parser =
|
||||||
|
tv::query::QueryParser::for_index(&index.inner, default_fields);
|
||||||
|
Ok(QueryParser { inner: parser })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a string into a query that can be given to a searcher.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// query (str): A query string that should be parsed into a query.
|
||||||
|
///
|
||||||
|
/// Returns the parsed Query object. Raises ValueError if there was an
|
||||||
|
/// error with the query string.
|
||||||
|
fn parse_query(&self, query: &str) -> PyResult<Query> {
|
||||||
|
let ret = self.inner.parse_query(query);
|
||||||
|
|
||||||
|
match ret {
|
||||||
|
Ok(q) => Ok(Query { inner: q }),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the default way to compose queries to a conjunction.
|
||||||
|
///
|
||||||
|
/// By default, the query happy tax payer is equivalent to the query happy
|
||||||
|
/// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
|
||||||
|
/// payer will be interpreted by the parser as happy AND tax AND payer.
|
||||||
|
fn set_conjunction_by_default(&mut self) {
|
||||||
|
self.inner.set_conjunction_by_default();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
use tantivy::schema;
|
||||||
|
|
||||||
|
use crate::document::Document;
|
||||||
|
use crate::field::Field;
|
||||||
|
|
||||||
|
/// Tantivy schema.
|
||||||
|
///
|
||||||
|
/// The schema is very strict. To build the schema the `SchemaBuilder` class is
|
||||||
|
/// provided.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Schema {
|
||||||
|
pub(crate) inner: schema::Schema,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Schema {
|
||||||
|
/// Build a document object from a json string.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// doc_json (str) - A string containing json that should be parsed
|
||||||
|
/// into a `Document`
|
||||||
|
///
|
||||||
|
/// Returns the parsed document, raises a ValueError if the parsing failed.
|
||||||
|
fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
|
||||||
|
let ret = self.inner.parse_document(doc_json);
|
||||||
|
match ret {
|
||||||
|
Ok(d) => Ok(Document { inner: d }),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a `Document` object into a json string.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// doc (Document): The document that will be converted into a json
|
||||||
|
/// string.
|
||||||
|
fn to_json(&self, doc: &Document) -> String {
|
||||||
|
self.inner.to_json(&doc.inner)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the field name for a given `Field`.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// field (Field): The field for which the name will be returned.
|
||||||
|
fn get_field_name(&self, field: &Field) -> &str {
|
||||||
|
self.inner.get_field_name(field.inner)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the field option associated with a given name.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field that we want to retrieve.
|
||||||
|
///
|
||||||
|
/// Returns the Field if one is found, None otherwise.
|
||||||
|
fn get_field(&self, name: &str) -> Option<Field> {
|
||||||
|
let f = self.inner.get_field(name);
|
||||||
|
match f {
|
||||||
|
Some(field) => Some(Field { inner: field }),
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,318 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
use tantivy::schema;
|
||||||
|
|
||||||
|
use crate::field::Field;
|
||||||
|
use crate::schema::Schema;
|
||||||
|
|
||||||
|
/// Tantivy has a very strict schema.
|
||||||
|
/// You need to specify in advance whether a field is indexed or not,
|
||||||
|
/// stored or not.
|
||||||
|
///
|
||||||
|
/// This is done by creating a schema object, and
|
||||||
|
/// setting up the fields one by one.
|
||||||
|
///
|
||||||
|
/// Examples:
|
||||||
|
///
|
||||||
|
/// >>> builder = tantivy.SchemaBuilder()
|
||||||
|
///
|
||||||
|
/// >>> title = builder.add_text_field("title", stored=True)
|
||||||
|
/// >>> body = builder.add_text_field("body")
|
||||||
|
///
|
||||||
|
/// >>> schema = builder.build()
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct SchemaBuilder {
|
||||||
|
pub(crate) builder: Option<schema::SchemaBuilder>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default tokenizer name; referenced by the `#[args]` attribute of
// `SchemaBuilder::add_text_field` for its `tokenizer_name` argument.
const TOKENIZER: &str = "default";
|
||||||
|
// Default index option; referenced by the `#[args]` attribute of
// `SchemaBuilder::add_text_field` for its `index_option` argument.
const RECORD: &str = "position";
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl SchemaBuilder {
|
||||||
|
#[new]
|
||||||
|
fn new(obj: &PyRawObject) {
|
||||||
|
obj.init(SchemaBuilder {
|
||||||
|
builder: Some(schema::Schema::builder()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a new text field to the schema.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
/// stored (bool, optional): If true sets the field as stored, the
|
||||||
|
/// content of the field can be later restored from a Searcher.
|
||||||
|
/// Defaults to False.
|
||||||
|
/// tokenizer_name (str, optional): The name of the tokenizer that
|
||||||
|
/// should be used to process the field. Defaults to 'default'
|
||||||
|
/// index_option (str, optional): Sets which information should be
|
||||||
|
/// indexed with the tokens. Can be one of 'position', 'freq' or
|
||||||
|
/// 'basic'. Defaults to 'position'. The 'basic' index_option
|
||||||
|
/// records only the document ID, the 'freq' option records the
|
||||||
|
/// document id and the term frequency, while the 'position' option
|
||||||
|
/// records the document id, term frequency and the positions of
|
||||||
|
/// the term occurrences in the document.
|
||||||
|
///
|
||||||
|
/// Returns the associated field handle.
|
||||||
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
|
#[args(
|
||||||
|
stored = false,
|
||||||
|
tokenizer_name = "TOKENIZER",
|
||||||
|
index_option = "RECORD"
|
||||||
|
)]
|
||||||
|
fn add_text_field(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
stored: bool,
|
||||||
|
tokenizer_name: &str,
|
||||||
|
index_option: &str,
|
||||||
|
) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
let index_option = match index_option {
|
||||||
|
"position" => schema::IndexRecordOption::WithFreqsAndPositions,
|
||||||
|
"freq" => schema::IndexRecordOption::WithFreqs,
|
||||||
|
"basic" => schema::IndexRecordOption::Basic,
|
||||||
|
_ => return Err(exceptions::ValueError::py_err(
|
||||||
|
"Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
|
||||||
|
))
|
||||||
|
};
|
||||||
|
|
||||||
|
let indexing = schema::TextFieldIndexing::default()
|
||||||
|
.set_tokenizer(tokenizer_name)
|
||||||
|
.set_index_option(index_option);
|
||||||
|
|
||||||
|
let options =
|
||||||
|
schema::TextOptions::default().set_indexing_options(indexing);
|
||||||
|
let options = if stored {
|
||||||
|
options.set_stored()
|
||||||
|
} else {
|
||||||
|
options
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_text_field(name, options);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a new signed integer field to the schema.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
/// stored (bool, optional): If true sets the field as stored, the
|
||||||
|
/// content of the field can be later restored from a Searcher.
|
||||||
|
/// Defaults to False.
|
||||||
|
/// indexed (bool, optional): If true sets the field to be indexed.
|
||||||
|
/// fast (str, optional): Set the u64 options as a single-valued fast
|
||||||
|
/// field. Fast fields are designed for random access. Access time
|
||||||
|
/// are similar to a random lookup in an array. If more than one
|
||||||
|
/// value is associated to a fast field, only the last one is kept.
|
||||||
|
/// Can be one of 'single' or 'multi'. If this is set to 'single,
|
||||||
|
/// the document must have exactly one value associated to the
|
||||||
|
/// document. If this is set to 'multi', the document can have any
|
||||||
|
/// number of values associated to the document. Defaults to None,
|
||||||
|
/// which disables this option.
|
||||||
|
///
|
||||||
|
/// Returns the associated field handle.
|
||||||
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
|
#[args(stored = false, indexed = false)]
|
||||||
|
fn add_integer_field(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
stored: bool,
|
||||||
|
indexed: bool,
|
||||||
|
fast: Option<&str>,
|
||||||
|
) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_i64_field(name, opts);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a new unsigned integer field to the schema.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
/// stored (bool, optional): If true sets the field as stored, the
|
||||||
|
/// content of the field can be later restored from a Searcher.
|
||||||
|
/// Defaults to False.
|
||||||
|
/// indexed (bool, optional): If true sets the field to be indexed.
|
||||||
|
/// fast (str, optional): Set the u64 options as a single-valued fast
|
||||||
|
/// field. Fast fields are designed for random access. Access time
|
||||||
|
/// are similar to a random lookup in an array. If more than one
|
||||||
|
/// value is associated to a fast field, only the last one is kept.
|
||||||
|
/// Can be one of 'single' or 'multi'. If this is set to 'single,
|
||||||
|
/// the document must have exactly one value associated to the
|
||||||
|
/// document. If this is set to 'multi', the document can have any
|
||||||
|
/// number of values associated to the document. Defaults to None,
|
||||||
|
/// which disables this option.
|
||||||
|
///
|
||||||
|
/// Returns the associated field handle.
|
||||||
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
|
#[args(stored = false, indexed = false)]
|
||||||
|
fn add_unsigned_field(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
stored: bool,
|
||||||
|
indexed: bool,
|
||||||
|
fast: Option<&str>,
|
||||||
|
) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_u64_field(name, opts);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a new date field to the schema.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
/// stored (bool, optional): If true sets the field as stored, the
|
||||||
|
/// content of the field can be later restored from a Searcher.
|
||||||
|
/// Defaults to False.
|
||||||
|
/// indexed (bool, optional): If true sets the field to be indexed.
|
||||||
|
/// fast (str, optional): Set the u64 options as a single-valued fast
|
||||||
|
/// field. Fast fields are designed for random access. Access time
|
||||||
|
/// are similar to a random lookup in an array. If more than one
|
||||||
|
/// value is associated to a fast field, only the last one is kept.
|
||||||
|
/// Can be one of 'single' or 'multi'. If this is set to 'single,
|
||||||
|
/// the document must have exactly one value associated to the
|
||||||
|
/// document. If this is set to 'multi', the document can have any
|
||||||
|
/// number of values associated to the document. Defaults to None,
|
||||||
|
/// which disables this option.
|
||||||
|
///
|
||||||
|
/// Returns the associated field handle.
|
||||||
|
/// Raises a ValueError if there was an error with the field creation.
|
||||||
|
#[args(stored = false, indexed = false)]
|
||||||
|
fn add_date_field(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
stored: bool,
|
||||||
|
indexed: bool,
|
||||||
|
fast: Option<&str>,
|
||||||
|
) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_date_field(name, opts);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a Facet field to the schema.
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_facet_field(name);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a fast bytes field to the schema.
|
||||||
|
///
|
||||||
|
/// Bytes field are not searchable and are only used
|
||||||
|
/// as fast field, to associate any kind of payload
|
||||||
|
/// to a document.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// name (str): The name of the field.
|
||||||
|
fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
|
||||||
|
let builder = &mut self.builder;
|
||||||
|
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let field = builder.add_bytes_field(name);
|
||||||
|
Ok(Field { inner: field })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finalize the creation of a Schema.
|
||||||
|
///
|
||||||
|
/// Returns a Schema object. After this is called the SchemaBuilder cannot
|
||||||
|
/// be used anymore.
|
||||||
|
fn build(&mut self) -> PyResult<Schema> {
|
||||||
|
let builder = self.builder.take();
|
||||||
|
if let Some(builder) = builder {
|
||||||
|
let schema = builder.build();
|
||||||
|
Ok(Schema { inner: schema })
|
||||||
|
} else {
|
||||||
|
Err(exceptions::ValueError::py_err(
|
||||||
|
"Schema builder object isn't valid anymore.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SchemaBuilder {
|
||||||
|
fn build_int_option(
|
||||||
|
stored: bool,
|
||||||
|
indexed: bool,
|
||||||
|
fast: Option<&str>,
|
||||||
|
) -> PyResult<schema::IntOptions> {
|
||||||
|
let opts = schema::IntOptions::default();
|
||||||
|
|
||||||
|
let opts = if stored { opts.set_stored() } else { opts };
|
||||||
|
let opts = if indexed { opts.set_indexed() } else { opts };
|
||||||
|
|
||||||
|
let fast = match fast {
|
||||||
|
Some(f) => {
|
||||||
|
let f = f.to_lowercase();
|
||||||
|
match f.as_ref() {
|
||||||
|
"single" => Some(schema::Cardinality::SingleValue),
|
||||||
|
"multi" => Some(schema::Cardinality::MultiValues),
|
||||||
|
_ => return Err(exceptions::ValueError::py_err(
|
||||||
|
"Invalid index option, valid choices are: 'multivalue' and 'singlevalue'"
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let opts = if let Some(f) = fast {
|
||||||
|
opts.set_fast(f)
|
||||||
|
} else {
|
||||||
|
opts
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(opts)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,134 @@
|
||||||
|
use pyo3::exceptions;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
use tantivy as tv;
|
||||||
|
|
||||||
|
use crate::document::Document;
|
||||||
|
use crate::query::Query;
|
||||||
|
|
||||||
|
/// Tantivy's Searcher class
|
||||||
|
///
|
||||||
|
/// A Searcher is used to search the index given a prepared Query.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct Searcher {
|
||||||
|
pub(crate) inner: tv::LeasedItem<tv::Searcher>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Searcher {
|
||||||
|
/// Search the index with the given query and collect results.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// query (Query): The query that will be used for the search.
|
||||||
|
/// collector (Collector): A collector that determines how the search
|
||||||
|
/// results will be collected. Only the TopDocs collector is
|
||||||
|
/// supported for now.
|
||||||
|
///
|
||||||
|
/// Returns a list of tuples that contains the scores and DocAddress of the
|
||||||
|
/// search results.
|
||||||
|
///
|
||||||
|
/// Raises a ValueError if there was an error with the search.
|
||||||
|
fn search(
|
||||||
|
&self,
|
||||||
|
query: &Query,
|
||||||
|
collector: &mut TopDocs,
|
||||||
|
) -> PyResult<Vec<(f32, DocAddress)>> {
|
||||||
|
let ret = self.inner.search(&query.inner, &collector.inner);
|
||||||
|
match ret {
|
||||||
|
Ok(r) => {
|
||||||
|
let result: Vec<(f32, DocAddress)> = r
|
||||||
|
.iter()
|
||||||
|
.map(|(f, d)| (f.clone(), DocAddress::from(d)))
|
||||||
|
.collect();
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the overall number of documents in the index.
|
||||||
|
#[getter]
|
||||||
|
fn num_docs(&self) -> u64 {
|
||||||
|
self.inner.num_docs()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetches a document from Tantivy's store given a DocAddress.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// doc_address (DocAddress): The DocAddress that is associated with
|
||||||
|
/// the document that we wish to fetch.
|
||||||
|
///
|
||||||
|
/// Returns the Document, raises ValueError if the document can't be found.
|
||||||
|
fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
|
||||||
|
let ret = self.inner.doc(doc_address.into());
|
||||||
|
match ret {
|
||||||
|
Ok(doc) => Ok(Document { inner: doc }),
|
||||||
|
Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// DocAddress contains all the necessary information to identify a document
|
||||||
|
/// given a Searcher object.
|
||||||
|
///
|
||||||
|
/// It consists in an id identifying its segment, and its segment-local DocId.
|
||||||
|
/// The id used for the segment is actually an ordinal in the list of segment
|
||||||
|
/// hold by a Searcher.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct DocAddress {
|
||||||
|
pub(crate) segment_ord: tv::SegmentLocalId,
|
||||||
|
pub(crate) doc: tv::DocId,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl DocAddress {
|
||||||
|
/// The segment ordinal is an id identifying the segment hosting the
|
||||||
|
/// document. It is only meaningful, in the context of a searcher.
|
||||||
|
#[getter]
|
||||||
|
fn segment_ord(&self) -> u32 {
|
||||||
|
self.segment_ord
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The segment local DocId
|
||||||
|
#[getter]
|
||||||
|
fn doc(&self) -> u32 {
|
||||||
|
self.doc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&tv::DocAddress> for DocAddress {
|
||||||
|
fn from(doc_address: &tv::DocAddress) -> Self {
|
||||||
|
DocAddress {
|
||||||
|
segment_ord: doc_address.segment_ord(),
|
||||||
|
doc: doc_address.doc(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rebuilds the Tantivy address from the stored segment ordinal and
/// document id.
///
/// Implemented as `From` rather than a hand-written `Into`: the standard
/// library's blanket impl still provides `(&DocAddress).into()`, so
/// existing `.into()` call sites keep working.
impl From<&DocAddress> for tv::DocAddress {
    fn from(doc_address: &DocAddress) -> Self {
        tv::DocAddress(doc_address.segment_ord, doc_address.doc)
    }
}
|
||||||
|
|
||||||
|
/// The Top Score Collector keeps track of the K documents sorted by their
|
||||||
|
/// score.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// limit (int, optional): The number of documents that the top scorer will
|
||||||
|
/// retrieve. Must be a positive integer larger than 0. Defaults to 10.
|
||||||
|
#[pyclass]
|
||||||
|
pub(crate) struct TopDocs {
|
||||||
|
inner: tv::collector::TopDocs,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl TopDocs {
|
||||||
|
#[new]
|
||||||
|
#[args(limit = 10)]
|
||||||
|
fn new(obj: &PyRawObject, limit: usize) -> PyResult<()> {
|
||||||
|
let top = tv::collector::TopDocs::with_limit(limit);
|
||||||
|
obj.init(TopDocs { inner: top });
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1 @@
|
||||||
|
from .tantivy import *
|
|
@ -0,0 +1,85 @@
|
||||||
|
import json
|
||||||
|
import tantivy
|
||||||
|
|
||||||
|
|
||||||
|
class TestClass(object):
    """End-to-end tests for the tantivy Python bindings."""

    def test_simple_search(self):
        """Index three documents and check that a query finds the right one."""
        builder = tantivy.SchemaBuilder()

        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")

        schema = builder.build()
        index = tantivy.Index(schema)

        writer = index.writer()

        # First document is built field by field. The trailing space after
        # "in" keeps the concatenated literal from reading "inthe".
        doc = tantivy.Document()
        doc.add_text(title, "The Old Man and the Sea")
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)

        # The remaining documents are parsed from json.
        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winter’s flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool")
        }))

        writer.add_document(doc)

        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings.  I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking.")
        }))

        writer.add_document(doc)
        writer.commit()

        reader = index.reader()
        searcher = reader.searcher()

        query_parser = tantivy.QueryParser.for_index(index, [title, body])
        query = query_parser.parse_query("sea whale")

        top_docs = tantivy.TopDocs(10)

        result = searcher.search(query, top_docs)

        # Only the first document mentions "sea" (in its title).
        assert len(result) == 1

        _, doc_address = result[0]

        searched_doc = searcher.doc(doc_address)
        assert searched_doc.get_first(title) == "The Old Man and the Sea"

    def test_doc(self):
        """Check the basic accessors of a Document."""
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)

        doc = tantivy.Document()
        assert doc.is_empty

        doc.add_text(title, "The Old Man and the Sea")

        assert doc.get_first(title) == "The Old Man and the Sea"

        assert doc.len == 1
        assert not doc.is_empty
|
Loading…
Reference in New Issue