Merge pull request #1 from matrix-org/initial-bindings
Initial Python bindings implementation.
commit 9158a4fd7d
@ -1,10 +1,7 @@
-# Generated by Cargo
-# will have compiled files and executables
-/target/
-
-# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
-# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
-Cargo.lock
-
-# These are backup files generated by rustfmt
+/target
**/*.rs.bk
+Cargo.lock
+dist/
+__pycache__/
+tantivy.so
+tantivy.egg-info/
@ -0,0 +1,18 @@
[package]
name = "tantivy-py"
version = "0.1.0"
authors = ["Damir Jelić <poljar@termina.org.uk>"]
edition = "2018"
license = "MIT"

[lib]
name = "tantivy"
crate-type = ["dylib"]

[dependencies]
chrono = "0.4"
tantivy = { git = "https://github.com/tantivy-search/tantivy" }

[dependencies.pyo3]
version = "0.7.0"
features = ["extension-module"]
LICENSE
@ -1,6 +1,6 @@
MIT License

-Copyright (c) 2019 tantivy
+Copyright (c) 2019 The Matrix.org Foundation CIC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@ -0,0 +1,4 @@
include Cargo.toml
include Makefile
include rust-toolchain
recursive-include src *
@ -0,0 +1,17 @@
source_files := $(wildcard src/*.rs)

all: tantivy/tantivy.so

.PHONY: test format

test: tantivy/tantivy.so
	python3 -m pytest

format:
	rustfmt src/*.rs

tantivy/tantivy.so: target/debug/libtantivy.so
	cp target/debug/libtantivy.so tantivy/tantivy.so

target/debug/libtantivy.so: $(source_files)
	cargo build
@ -0,0 +1,54 @@
tantivy-py
==========

Python bindings for tantivy.


# Installation

The bindings can be installed using setuptools:

    python3 setup.py install --user

Note that this requires setuptools-rust to be installed. Also note that the
bindings use [PyO3](https://github.com/PyO3/pyo3), which requires nightly Rust
and only supports Python 3.

# Usage

tantivy-py has an API similar to tantivy's. To create an index, a schema first
needs to be built. After that, documents can be added to the index and a
reader can be created to search the index.

```python
import tantivy

builder = tantivy.SchemaBuilder()

title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")

schema = builder.build()
index = tantivy.Index(schema)

writer = index.writer()

doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                    "the Gulf Stream and he had gone eighty-four days "
                    "now without taking a fish."))
writer.add_document(doc)
writer.commit()

reader = index.reader()
searcher = reader.searcher()

query_parser = tantivy.QueryParser.for_index(index, [title, body])
query = query_parser.parse_query("sea whale")

top_docs = tantivy.TopDocs(10)
result = searcher.search(query, top_docs)

_, doc_address = result[0]

searched_doc = searcher.doc(doc_address)
assert searched_doc.get_first(title) == "The Old Man and the Sea"
```
@ -0,0 +1,2 @@
[build-system]
requires = ["setuptools", "wheel", "setuptools-rust"]
@ -0,0 +1 @@
nightly-2019-05-22
@ -0,0 +1 @@
max_width = 80
@ -0,0 +1,15 @@
from setuptools import setup

try:
    from setuptools_rust import Binding, RustExtension
except ImportError:
    print("Please install setuptools-rust package")
    raise SystemExit(1)

setup(
    name="tantivy",
    version="0.9.1",
    rust_extensions=[RustExtension("tantivy.tantivy", binding=Binding.PyO3)],
    packages=["tantivy"],
    zip_safe=False,
)
@ -0,0 +1,152 @@
use pyo3::prelude::*;
use pyo3::types::PyDateTime;
use pyo3::types::{PyDateAccess, PyTimeAccess};

use chrono::offset::TimeZone;
use chrono::Utc;

use tantivy as tv;

use crate::facet::Facet;
use crate::field::{Field, FieldValue};

/// Tantivy's Document is the object that can be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered tuples
/// (field, value). In this list, one field may appear more than once.
///
/// Example:
///     >>> doc = tantivy.Document()
///     >>> doc.add_text(title, "The Old Man and the Sea")
///     >>> doc.add_text(body, ("He was an old man who fished alone in a "
///                             "skiff in the Gulf Stream and he had gone "
///                             "eighty-four days now without taking a fish."))
#[pyclass]
pub(crate) struct Document {
    pub(crate) inner: tv::Document,
}

#[pymethods]
impl Document {
    #[new]
    fn new(obj: &PyRawObject) {
        obj.init(Document {
            inner: tv::Document::default(),
        });
    }

    /// Add a text value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the text.
    ///     text (str): The text that will be added to the document.
    fn add_text(&mut self, field: &Field, text: &str) {
        self.inner.add_text(field.inner, text);
    }

    /// Add an unsigned integer value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the integer.
    ///     value (int): The integer that will be added to the document.
    fn add_unsigned(&mut self, field: &Field, value: u64) {
        self.inner.add_u64(field.inner, value);
    }

    /// Add a signed integer value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the integer.
    ///     value (int): The integer that will be added to the document.
    fn add_integer(&mut self, field: &Field, value: i64) {
        self.inner.add_i64(field.inner, value);
    }

    /// Add a date value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the date.
    ///     value (datetime): The date that will be added to the document.
    fn add_date(&mut self, field: &Field, value: &PyDateTime) {
        let datetime = Utc
            .ymd(
                value.get_year().into(),
                value.get_month().into(),
                value.get_day().into(),
            )
            .and_hms_micro(
                value.get_hour().into(),
                value.get_minute().into(),
                value.get_second().into(),
                value.get_microsecond().into(),
            );

        self.inner.add_date(field.inner, &datetime);
    }

    /// Add a facet value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the facet.
    ///     value (Facet): The Facet that will be added to the document.
    fn add_facet(&mut self, field: &Field, value: &Facet) {
        self.inner.add_facet(field.inner, value.inner.clone());
    }

    /// Add a bytes value to the document.
    ///
    /// Args:
    ///     field (Field): The field for which we are adding the bytes.
    ///     value (bytes): The bytes that will be added to the document.
    fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
        self.inner.add_bytes(field.inner, value);
    }

    /// Returns the number of fields that have been added to the document.
    #[getter]
    fn len(&self) -> usize {
        self.inner.len()
    }

    /// True if the document is empty, False otherwise.
    #[getter]
    fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Get the first value associated with the given field.
    ///
    /// Args:
    ///     field (Field): The field for which we would like to get the value.
    ///
    /// Returns the value if one is found, otherwise None.
    /// The type of the value depends on the field.
    fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
        let value = self.inner.get_first(field.inner)?;
        FieldValue::value_to_py(py, value)
    }

    /// Get all the values associated with the given field.
    ///
    /// Args:
    ///     field (Field): The field for which we would like to get the values.
    ///
    /// Returns a list of values.
    /// The type of the values depends on the field.
    fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
        let values = self.inner.get_all(field.inner);
        values
            .iter()
            .map(|&v| FieldValue::value_to_py(py, v))
            .filter_map(|x| x)
            .collect()
    }

    /// Get all the fields and values contained in the document.
    fn field_values(&self, py: Python) -> Vec<FieldValue> {
        let field_values = self.inner.field_values();
        field_values
            .iter()
            .map(|v| FieldValue::field_value_to_py(py, v))
            .collect()
    }
}
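As a quick illustration of the Document class above, here is a minimal sketch of how these methods are driven from Python once the module is built; the field names and values are made up for the example:

```python
import datetime

import tantivy

# Build a small schema so we have Field handles to pass to Document methods.
builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
year = builder.add_unsigned_field("year", stored=True)
published = builder.add_date_field("published", stored=True)

doc = tantivy.Document()
assert doc.is_empty  # no (field, value) tuples yet

doc.add_text(title, "The Old Man and the Sea")
doc.add_unsigned(year, 1952)
doc.add_date(published, datetime.datetime(1952, 9, 1, 12, 0, 0))

assert doc.len == 3  # len and is_empty are exposed as getters, not methods
assert doc.get_first(title) == "The Old Man and the Sea"
assert doc.get_all(year) == [1952]
```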
@ -0,0 +1,55 @@
use pyo3::prelude::*;
use pyo3::types::PyType;

use tantivy::schema;

/// A Facet represents a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath. For instance, an
/// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
///
/// A document can be associated with any number of facets. The hierarchy
/// implicitly implies that a document belonging to a facet also belongs to
/// the ancestors of its facet. In the example above, /electronics/tv_and_video/
/// and /electronics.
#[pyclass]
pub(crate) struct Facet {
    pub(crate) inner: schema::Facet,
}

#[pymethods]
impl Facet {
    /// Create a new instance of the "root facet", equivalent to /.
    #[classmethod]
    fn root(_cls: &PyType) -> Facet {
        Facet {
            inner: schema::Facet::root(),
        }
    }

    /// Returns true if the facet is the root facet /.
    #[getter]
    fn is_root(&self) -> bool {
        self.inner.is_root()
    }

    /// Returns true if another Facet is a subfacet of this facet.
    ///
    /// Args:
    ///     other (Facet): The Facet that we should check is a subfacet of
    ///         this facet.
    fn is_prefix_of(&self, other: &Facet) -> bool {
        self.inner.is_prefix_of(&other.inner)
    }

    /// Create a Facet object from a string.
    ///
    /// Args:
    ///     facet_string (str): The string that contains a facet.
    ///
    /// Returns the created Facet.
    #[classmethod]
    fn from_string(_cls: &PyType, facet_string: &str) -> Facet {
        Facet {
            inner: schema::Facet::from_text(facet_string),
        }
    }
}
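A short sketch of the Facet class from the Python side, matching the methods bound above; the facet paths and the "category" field are illustrative:

```python
import tantivy

# Facets are written like file paths inside a hierarchy.
electronics = tantivy.Facet.from_string("/electronics")
led_tv = tantivy.Facet.from_string("/electronics/tv_and_video/led_tv")

root = tantivy.Facet.root()
assert root.is_root
assert not led_tv.is_root

# An ancestor facet is a prefix of its descendants.
assert electronics.is_prefix_of(led_tv)
assert not led_tv.is_prefix_of(electronics)

# Facets are attached to documents through a facet field.
builder = tantivy.SchemaBuilder()
category = builder.add_facet_field("category")
doc = tantivy.Document()
doc.add_facet(category, led_tv)
```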
@ -0,0 +1,72 @@
use pyo3::prelude::*;
use pyo3::types::PyDateTime;

use tantivy::schema;

use crate::facet::Facet;

/// Field is a numeric identifier that represents an entry in the Schema.
#[pyclass]
#[derive(Clone)]
pub(crate) struct Field {
    pub(crate) inner: schema::Field,
}

/// FieldValue holds together a Field and its Value.
#[pyclass]
pub(crate) struct FieldValue {
    pub(crate) field: Field,
    pub(crate) value: PyObject,
}

#[pymethods]
impl FieldValue {
    #[getter]
    fn field(&self) -> Field {
        self.field.clone()
    }

    #[getter]
    fn value(&self) -> &PyObject {
        &self.value
    }
}

impl FieldValue {
    pub(crate) fn value_to_py(
        py: Python,
        value: &schema::Value,
    ) -> Option<PyObject> {
        match value {
            schema::Value::Str(text) => Some(text.into_object(py)),
            schema::Value::U64(num) => Some(num.into_object(py)),
            schema::Value::I64(num) => Some(num.into_object(py)),
            schema::Value::Bytes(b) => Some(b.to_object(py)),
            schema::Value::Date(d) => {
                let date =
                    PyDateTime::from_timestamp(py, d.timestamp() as f64, None);

                match date {
                    Ok(d) => Some(d.into_object(py)),
                    Err(_e) => None,
                }
            }
            schema::Value::Facet(f) => {
                Some(Facet { inner: f.clone() }.into_object(py))
            }
        }
    }

    pub(crate) fn field_value_to_py(
        py: Python,
        field_value: &schema::FieldValue,
    ) -> FieldValue {
        let value = field_value.value();
        let field = field_value.field();

        FieldValue {
            field: Field { inner: field },
            value: FieldValue::value_to_py(py, value).unwrap(),
        }
    }
}
@ -0,0 +1,284 @@
use pyo3::exceptions;
use pyo3::prelude::*;

use crate::document::Document;
use crate::schema::Schema;
use crate::searcher::Searcher;
use tantivy as tv;
use tantivy::directory::MmapDirectory;

const RELOAD_POLICY: &str = "commit";

/// IndexReader is the entry point to read and search the index.
///
/// IndexReader controls when a new version of the index should be loaded and
/// lends you instances of Searcher for the last loaded version.
///
/// To create an IndexReader first create an Index and call the reader() method
/// on the index object.
#[pyclass]
pub(crate) struct IndexReader {
    inner: tv::IndexReader,
}

#[pymethods]
impl IndexReader {
    /// Update searchers so that they reflect the state of the last .commit().
    ///
    /// If you set the reload policy to be on 'commit' (which is the
    /// default) every commit should be rapidly reflected on your IndexReader
    /// and you should not need to call reload() at all.
    fn reload(&self) -> PyResult<()> {
        let ret = self.inner.reload();
        match ret {
            Ok(_) => Ok(()),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Get a Searcher for the index.
    ///
    /// This method should be called every single time a search query is
    /// performed. The searchers are taken from a pool of num_searchers
    /// searchers.
    ///
    /// Returns a Searcher object, if no searcher is available this may block.
    fn searcher(&self) -> Searcher {
        let searcher = self.inner.searcher();
        Searcher { inner: searcher }
    }
}

/// IndexWriter is the user entry-point to add documents to the index.
///
/// To create an IndexWriter first create an Index and call the writer() method
/// on the index object.
#[pyclass]
pub(crate) struct IndexWriter {
    inner: tv::IndexWriter,
}

#[pymethods]
impl IndexWriter {
    /// Add a document to the index.
    ///
    /// If the indexing pipeline is full, this call may block.
    ///
    /// Returns an `opstamp`, which is an increasing integer that can be used
    /// by the client to align commits with its own document queue.
    /// The `opstamp` represents the number of documents that have been added
    /// since the creation of the index.
    fn add_document(&mut self, document: &Document) -> PyResult<()> {
        self.inner.add_document(document.inner.clone());
        Ok(())
    }

    /// Commits all of the pending changes.
    ///
    /// A call to commit blocks. After it returns, all of the documents that
    /// were added since the last commit are published and persisted.
    ///
    /// In case of a crash or a hardware failure (as long as the hard disk is
    /// spared), it will be possible to resume indexing from this point.
    ///
    /// Returns the `opstamp` of the last document that made it in the commit.
    fn commit(&mut self) -> PyResult<()> {
        let ret = self.inner.commit();
        match ret {
            Ok(_) => Ok(()),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Rollback to the last commit.
    ///
    /// This cancels all of the updates that happened after the last
    /// commit. After calling rollback, the index is in the same state as it
    /// was after the last commit.
    fn rollback(&mut self) -> PyResult<()> {
        let ret = self.inner.rollback();

        match ret {
            Ok(_) => Ok(()),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Detects and removes the files that are not used by the index anymore.
    fn garbage_collect_files(&mut self) -> PyResult<()> {
        let ret = self.inner.garbage_collect_files();

        match ret {
            Ok(_) => Ok(()),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// The opstamp of the last successful commit.
    ///
    /// This is the opstamp the index will rollback to if there is a failure
    /// like a power surge.
    ///
    /// This is also the opstamp of the commit that is currently available
    /// for searchers.
    #[getter]
    fn commit_opstamp(&self) -> u64 {
        self.inner.commit_opstamp()
    }
}

/// Create a new index object.
///
/// Args:
///     schema (Schema): The schema of the index.
///     path (str, optional): The path where the index should be stored. If
///         no path is provided, the index will be stored in memory.
///     reuse (bool, optional): Should we open an existing index if one exists
///         or always create a new one.
///
/// If an index already exists it will be opened and reused. Raises OSError
/// if there was a problem during the opening or creation of the index.
#[pyclass]
pub(crate) struct Index {
    pub(crate) inner: tv::Index,
}

#[pymethods]
impl Index {
    #[new]
    #[args(reuse = true)]
    fn new(
        obj: &PyRawObject,
        schema: &Schema,
        path: Option<&str>,
        reuse: bool,
    ) -> PyResult<()> {
        let index = match path {
            Some(p) => {
                let directory = MmapDirectory::open(p);

                let dir = match directory {
                    Ok(d) => d,
                    Err(e) => {
                        return Err(exceptions::OSError::py_err(e.to_string()))
                    }
                };

                let i = if reuse {
                    tv::Index::open_or_create(dir, schema.inner.clone())
                } else {
                    tv::Index::create(dir, schema.inner.clone())
                };

                match i {
                    Ok(index) => index,
                    Err(e) => {
                        return Err(exceptions::OSError::py_err(e.to_string()))
                    }
                }
            }
            None => tv::Index::create_in_ram(schema.inner.clone()),
        };

        obj.init(Index { inner: index });
        Ok(())
    }

    /// Create an `IndexWriter` for the index.
    ///
    /// The writer will be multithreaded and the provided heap size will be
    /// split between the given number of threads.
    ///
    /// Args:
    ///     heap_size (int, optional): The total target memory usage of the
    ///         writer, can't be less than 3000000.
    ///     num_threads (int, optional): The number of threads that the writer
    ///         should use. If this value is 0, tantivy will choose the number
    ///         of threads automatically.
    ///
    /// Raises ValueError if there was an error while creating the writer.
    #[args(heap_size = 3000000, num_threads = 0)]
    fn writer(
        &self,
        heap_size: usize,
        num_threads: usize,
    ) -> PyResult<IndexWriter> {
        let writer = match num_threads {
            0 => self.inner.writer(heap_size),
            _ => self.inner.writer_with_num_threads(num_threads, heap_size),
        };

        match writer {
            Ok(w) => Ok(IndexWriter { inner: w }),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Create an IndexReader for the index.
    ///
    /// Args:
    ///     reload_policy (str, optional): The reload policy that the
    ///         IndexReader should use. Can be 'manual' or 'OnCommit'.
    ///     num_searchers (int, optional): The number of searchers that the
    ///         reader should create.
    ///
    /// Returns the IndexReader on success, raises ValueError if an IndexReader
    /// couldn't be created.
    #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
    fn reader(
        &self,
        reload_policy: &str,
        num_searchers: usize,
    ) -> PyResult<IndexReader> {
        let reload_policy = reload_policy.to_lowercase();
        let reload_policy = match reload_policy.as_ref() {
            "commit" => tv::ReloadPolicy::OnCommit,
            "on-commit" => tv::ReloadPolicy::OnCommit,
            "oncommit" => tv::ReloadPolicy::OnCommit,
            "manual" => tv::ReloadPolicy::Manual,
            _ => return Err(exceptions::ValueError::py_err(
                "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
            ))
        };

        let builder = self.inner.reader_builder();

        let builder = builder.reload_policy(reload_policy);
        let builder = if num_searchers > 0 {
            builder.num_searchers(num_searchers)
        } else {
            builder
        };

        let reader = builder.try_into();
        match reader {
            Ok(r) => Ok(IndexReader { inner: r }),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Check if the given path contains an existing index.
    ///
    /// Args:
    ///     path: The path where tantivy will search for an index.
    ///
    /// Returns True if an index exists at the given path, False otherwise.
    ///
    /// Raises OSError if the directory cannot be opened.
    #[staticmethod]
    fn exists(path: &str) -> PyResult<bool> {
        let directory = MmapDirectory::open(path);
        let dir = match directory {
            Ok(d) => d,
            Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
        };

        Ok(tv::Index::exists(&dir))
    }

    /// The schema of the current index.
    #[getter]
    fn schema(&self) -> Schema {
        let schema = self.inner.schema();
        Schema { inner: schema }
    }
}
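To make the Index, IndexWriter and IndexReader surface above concrete, a small sketch of an on-disk index round trip; the directory path is an arbitrary example and must already exist because the index is opened through an MmapDirectory:

```python
import os

import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()

# With a path the index lives on disk; reuse=True reopens an existing index
# instead of always creating a new one. Without a path it lives in RAM.
os.makedirs("/tmp/example-index", exist_ok=True)
index = tantivy.Index(schema, "/tmp/example-index", reuse=True)
assert tantivy.Index.exists("/tmp/example-index")

# heap_size is split between the indexing threads; 0 threads lets tantivy pick.
writer = index.writer(heap_size=3000000, num_threads=0)

doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
writer.add_document(doc)
writer.commit()

# A 'manual' reader only sees new commits after an explicit reload().
reader = index.reader(reload_policy="manual", num_searchers=1)
reader.reload()
searcher = reader.searcher()
assert searcher.num_docs == 1
```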
@ -0,0 +1,88 @@
use pyo3::prelude::*;

mod document;
mod facet;
mod field;
mod index;
mod query;
mod schema;
mod schemabuilder;
mod searcher;

use document::Document;
use facet::Facet;
use field::{Field, FieldValue};
use index::Index;
use query::QueryParser;
use schema::Schema;
use schemabuilder::SchemaBuilder;
use searcher::{DocAddress, Searcher, TopDocs};

/// Python bindings for the search engine library Tantivy.
///
/// Tantivy is a full text search engine library written in Rust.
///
/// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
/// the sense that it is not an off-the-shelf search engine server, but rather
/// a library that can be used to build such a search engine.
/// Tantivy is, in fact, strongly inspired by Lucene's design.
///
/// Example:
///     >>> import json
///     >>> import tantivy
///
///     >>> builder = tantivy.SchemaBuilder()
///
///     >>> title = builder.add_text_field("title", stored=True)
///     >>> body = builder.add_text_field("body")
///
///     >>> schema = builder.build()
///     >>> index = tantivy.Index(schema)
///     >>> writer = index.writer()
///
///     >>> doc = tantivy.Document()
///     >>> doc.add_text(title, "The Old Man and the Sea")
///     >>> doc.add_text(body, ("He was an old man who fished alone in a "
///                             "skiff in the Gulf Stream and he had gone "
///                             "eighty-four days now without taking a fish."))
///
///     >>> writer.add_document(doc)
///
///     >>> doc = schema.parse_document(json.dumps({
///            "title": ["Frankenstein", "The Modern Prometheus"],
///            "body": ("You will rejoice to hear that no disaster has "
///                     "accompanied the commencement of an enterprise which "
///                     "you have regarded with such evil forebodings. "
///                     "I arrived here yesterday, and my first task is to "
///                     "assure my dear sister of my welfare and increasing "
///                     "confidence in the success of my undertaking.")
///     }))
///
///     >>> writer.add_document(doc)
///     >>> writer.commit()
///
///     >>> reader = index.reader()
///     >>> searcher = reader.searcher()
///
///     >>> query_parser = tantivy.QueryParser.for_index(index, [title, body])
///     >>> query = query_parser.parse_query("sea whale")
///
///     >>> top_docs = tantivy.TopDocs(10)
///     >>> result = searcher.search(query, top_docs)
///
///     >>> assert len(result) == 1
///
#[pymodule]
fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<Schema>()?;
    m.add_class::<SchemaBuilder>()?;
    m.add_class::<Searcher>()?;
    m.add_class::<Index>()?;
    m.add_class::<QueryParser>()?;
    m.add_class::<Document>()?;
    m.add_class::<DocAddress>()?;
    m.add_class::<TopDocs>()?;
    m.add_class::<Field>()?;
    m.add_class::<FieldValue>()?;
    m.add_class::<Facet>()?;

    Ok(())
}
@ -0,0 +1,70 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::PyType;

use tantivy as tv;

use crate::field::Field;
use crate::index::Index;

/// Tantivy's Query
#[pyclass]
pub(crate) struct Query {
    pub(crate) inner: Box<dyn tv::query::Query>,
}

/// Tantivy's Query parser
#[pyclass]
pub(crate) struct QueryParser {
    inner: tv::query::QueryParser,
}

#[pymethods]
impl QueryParser {
    /// Creates a QueryParser for an Index.
    ///
    /// Args:
    ///     index (Index): The index for which the query will be created.
    ///     default_fields (List[Field]): A list of fields used to search if no
    ///         field is specified in the query.
    ///
    /// Returns the QueryParser.
    #[classmethod]
    fn for_index(
        _cls: &PyType,
        index: &Index,
        default_fields: Vec<&Field>,
    ) -> PyResult<QueryParser> {
        let default_fields: Vec<tv::schema::Field> =
            default_fields.iter().map(|&f| f.inner.clone()).collect();

        let parser =
            tv::query::QueryParser::for_index(&index.inner, default_fields);
        Ok(QueryParser { inner: parser })
    }

    /// Parse a string into a query that can be given to a searcher.
    ///
    /// Args:
    ///     query (str): A query string that should be parsed into a query.
    ///
    /// Returns the parsed Query object. Raises ValueError if there was an
    /// error with the query string.
    fn parse_query(&self, query: &str) -> PyResult<Query> {
        let ret = self.inner.parse_query(query);

        match ret {
            Ok(q) => Ok(Query { inner: q }),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Set the default way to compose queries to a conjunction.
    ///
    /// By default, the query happy tax payer is equivalent to the query happy
    /// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
    /// payer will be interpreted by the parser as happy AND tax AND payer.
    fn set_conjunction_by_default(&mut self) {
        self.inner.set_conjunction_by_default();
    }
}
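A brief Python sketch of the QueryParser behaviour described above, in particular the effect of set_conjunction_by_default(); the indexed text is illustrative:

```python
import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body")
schema = builder.build()

index = tantivy.Index(schema)
writer = index.writer()
doc = tantivy.Document()
doc.add_text(title, "The Old Man and the Sea")
doc.add_text(body, "He was an old man who fished alone in a skiff.")
writer.add_document(doc)
writer.commit()

searcher = index.reader().searcher()
parser = tantivy.QueryParser.for_index(index, [title, body])

# By default terms are OR'd: a document matching any single term is a hit.
query = parser.parse_query("old man whale")
assert len(searcher.search(query, tantivy.TopDocs(10))) == 1

# With conjunction by default the same string means old AND man AND whale,
# so the document above no longer matches ("whale" is missing).
parser.set_conjunction_by_default()
query = parser.parse_query("old man whale")
assert len(searcher.search(query, tantivy.TopDocs(10))) == 0
```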
@ -0,0 +1,65 @@
use pyo3::exceptions;
use pyo3::prelude::*;

use tantivy::schema;

use crate::document::Document;
use crate::field::Field;

/// Tantivy schema.
///
/// The schema is very strict. To build the schema the `SchemaBuilder` class is
/// provided.
#[pyclass]
pub(crate) struct Schema {
    pub(crate) inner: schema::Schema,
}

#[pymethods]
impl Schema {
    /// Build a document object from a JSON string.
    ///
    /// Args:
    ///     doc_json (str): A string containing JSON that should be parsed
    ///         into a `Document`.
    ///
    /// Returns the parsed document, raises a ValueError if the parsing failed.
    fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
        let ret = self.inner.parse_document(doc_json);
        match ret {
            Ok(d) => Ok(Document { inner: d }),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Convert a `Document` object into a JSON string.
    ///
    /// Args:
    ///     doc (Document): The document that will be converted into a JSON
    ///         string.
    fn to_json(&self, doc: &Document) -> String {
        self.inner.to_json(&doc.inner)
    }

    /// Return the field name for a given `Field`.
    ///
    /// Args:
    ///     field (Field): The field for which the name will be returned.
    fn get_field_name(&self, field: &Field) -> &str {
        self.inner.get_field_name(field.inner)
    }

    /// Returns the field associated with a given name.
    ///
    /// Args:
    ///     name (str): The name of the field that we want to retrieve.
    ///
    /// Returns the Field if one is found, None otherwise.
    fn get_field(&self, name: &str) -> Option<Field> {
        let f = self.inner.get_field(name);
        match f {
            Some(field) => Some(Field { inner: field }),
            None => None,
        }
    }
}
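A small sketch of the Schema helpers above, going from JSON to a Document and back; the JSON payload and field names are made up:

```python
import json

import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
body = builder.add_text_field("body", stored=True)
schema = builder.build()

# parse_document() validates the JSON against the schema and raises
# ValueError if parsing fails.
doc = schema.parse_document(json.dumps({
    "title": "Of Mice and Men",
    "body": "A few miles south of Soledad, the Salinas River drops in close.",
}))

assert schema.get_field_name(title) == "title"
assert schema.get_field("title") is not None
assert schema.get_field("missing") is None

# to_json() renders the document back using the schema's field names.
print(schema.to_json(doc))
```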
@ -0,0 +1,318 @@
use pyo3::exceptions;
use pyo3::prelude::*;

use tantivy::schema;

use crate::field::Field;
use crate::schema::Schema;

/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not.
///
/// This is done by creating a schema object, and
/// setting up the fields one by one.
///
/// Examples:
///
///     >>> builder = tantivy.SchemaBuilder()
///
///     >>> title = builder.add_text_field("title", stored=True)
///     >>> body = builder.add_text_field("body")
///
///     >>> schema = builder.build()
#[pyclass]
pub(crate) struct SchemaBuilder {
    pub(crate) builder: Option<schema::SchemaBuilder>,
}

const TOKENIZER: &str = "default";
const RECORD: &str = "position";

#[pymethods]
impl SchemaBuilder {
    #[new]
    fn new(obj: &PyRawObject) {
        obj.init(SchemaBuilder {
            builder: Some(schema::Schema::builder()),
        });
    }

    /// Add a new text field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     tokenizer_name (str, optional): The name of the tokenizer that
    ///         should be used to process the field. Defaults to 'default'.
    ///     index_option (str, optional): Sets which information should be
    ///         indexed with the tokens. Can be one of 'position', 'freq' or
    ///         'basic'. Defaults to 'position'. The 'basic' index_option
    ///         records only the document ID, the 'freq' option records the
    ///         document id and the term frequency, while the 'position' option
    ///         records the document id, term frequency and the positions of
    ///         the term occurrences in the document.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[args(
        stored = false,
        tokenizer_name = "TOKENIZER",
        index_option = "RECORD"
    )]
    fn add_text_field(
        &mut self,
        name: &str,
        stored: bool,
        tokenizer_name: &str,
        index_option: &str,
    ) -> PyResult<Field> {
        let builder = &mut self.builder;

        let index_option = match index_option {
            "position" => schema::IndexRecordOption::WithFreqsAndPositions,
            "freq" => schema::IndexRecordOption::WithFreqs,
            "basic" => schema::IndexRecordOption::Basic,
            _ => return Err(exceptions::ValueError::py_err(
                "Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
            ))
        };

        let indexing = schema::TextFieldIndexing::default()
            .set_tokenizer(tokenizer_name)
            .set_index_option(index_option);

        let options =
            schema::TextOptions::default().set_indexing_options(indexing);
        let options = if stored {
            options.set_stored()
        } else {
            options
        };

        if let Some(builder) = builder {
            let field = builder.add_text_field(name, options);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Add a new signed integer field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the integer options as a single-valued
    ///         fast field. Fast fields are designed for random access. Access
    ///         time is similar to a random lookup in an array. If more than
    ///         one value is associated to a fast field, only the last one is
    ///         kept. Can be one of 'single' or 'multi'. If this is set to
    ///         'single', the document must have exactly one value associated
    ///         to the document. If this is set to 'multi', the document can
    ///         have any number of values associated to the document. Defaults
    ///         to None, which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[args(stored = false, indexed = false)]
    fn add_integer_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: Option<&str>,
    ) -> PyResult<Field> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;

        if let Some(builder) = builder {
            let field = builder.add_i64_field(name, opts);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Add a new unsigned integer field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the u64 options as a single-valued fast
    ///         field. Fast fields are designed for random access. Access time
    ///         is similar to a random lookup in an array. If more than one
    ///         value is associated to a fast field, only the last one is kept.
    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
    ///         the document must have exactly one value associated to the
    ///         document. If this is set to 'multi', the document can have any
    ///         number of values associated to the document. Defaults to None,
    ///         which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[args(stored = false, indexed = false)]
    fn add_unsigned_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: Option<&str>,
    ) -> PyResult<Field> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;

        if let Some(builder) = builder {
            let field = builder.add_u64_field(name, opts);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Add a new date field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    ///     stored (bool, optional): If true sets the field as stored, the
    ///         content of the field can be later restored from a Searcher.
    ///         Defaults to False.
    ///     indexed (bool, optional): If true sets the field to be indexed.
    ///     fast (str, optional): Set the date options as a single-valued fast
    ///         field. Fast fields are designed for random access. Access time
    ///         is similar to a random lookup in an array. If more than one
    ///         value is associated to a fast field, only the last one is kept.
    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
    ///         the document must have exactly one value associated to the
    ///         document. If this is set to 'multi', the document can have any
    ///         number of values associated to the document. Defaults to None,
    ///         which disables this option.
    ///
    /// Returns the associated field handle.
    /// Raises a ValueError if there was an error with the field creation.
    #[args(stored = false, indexed = false)]
    fn add_date_field(
        &mut self,
        name: &str,
        stored: bool,
        indexed: bool,
        fast: Option<&str>,
    ) -> PyResult<Field> {
        let builder = &mut self.builder;

        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;

        if let Some(builder) = builder {
            let field = builder.add_date_field(name, opts);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Add a Facet field to the schema.
    ///
    /// Args:
    ///     name (str): The name of the field.
    fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
        let builder = &mut self.builder;

        if let Some(builder) = builder {
            let field = builder.add_facet_field(name);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Add a fast bytes field to the schema.
    ///
    /// Bytes fields are not searchable and are only used
    /// as fast fields, to associate any kind of payload
    /// to a document.
    ///
    /// Args:
    ///     name (str): The name of the field.
    fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
        let builder = &mut self.builder;

        if let Some(builder) = builder {
            let field = builder.add_bytes_field(name);
            Ok(Field { inner: field })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }

    /// Finalize the creation of a Schema.
    ///
    /// Returns a Schema object. After this is called the SchemaBuilder cannot
    /// be used anymore.
    fn build(&mut self) -> PyResult<Schema> {
        let builder = self.builder.take();
        if let Some(builder) = builder {
            let schema = builder.build();
            Ok(Schema { inner: schema })
        } else {
            Err(exceptions::ValueError::py_err(
                "Schema builder object isn't valid anymore.",
            ))
        }
    }
}

impl SchemaBuilder {
    fn build_int_option(
        stored: bool,
        indexed: bool,
        fast: Option<&str>,
    ) -> PyResult<schema::IntOptions> {
        let opts = schema::IntOptions::default();

        let opts = if stored { opts.set_stored() } else { opts };
        let opts = if indexed { opts.set_indexed() } else { opts };

        let fast = match fast {
            Some(f) => {
                let f = f.to_lowercase();
                match f.as_ref() {
                    "single" => Some(schema::Cardinality::SingleValue),
                    "multi" => Some(schema::Cardinality::MultiValues),
                    _ => return Err(exceptions::ValueError::py_err(
                        "Invalid fast option, valid choices are: 'single' and 'multi'"
                    )),
                }
            }
            None => None,
        };

        let opts = if let Some(f) = fast {
            opts.set_fast(f)
        } else {
            opts
        };

        Ok(opts)
    }
}
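As a sketch of the SchemaBuilder options documented above (the stored/indexed flags, the fast-field cardinality, and index_option for text fields); the field names are illustrative:

```python
import tantivy

builder = tantivy.SchemaBuilder()

# Text field: store the value and index term positions (needed for phrase queries).
title = builder.add_text_field("title", stored=True, index_option="position")

# Numeric and date fields: indexed for querying; fast="single" makes them
# single-valued fast fields for quick random access.
year = builder.add_integer_field("year", stored=True, indexed=True, fast="single")
rating = builder.add_unsigned_field("rating", indexed=True)
published = builder.add_date_field("published", stored=True, indexed=True)

# Facet and bytes fields only take a name in this version of the bindings.
category = builder.add_facet_field("category")
payload = builder.add_bytes_field("payload")

schema = builder.build()

# After build() the builder is consumed; reusing it raises ValueError.
try:
    builder.add_text_field("subtitle")
except ValueError:
    pass
```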
@ -0,0 +1,134 @@
use pyo3::exceptions;
use pyo3::prelude::*;

use tantivy as tv;

use crate::document::Document;
use crate::query::Query;

/// Tantivy's Searcher class
///
/// A Searcher is used to search the index given a prepared Query.
#[pyclass]
pub(crate) struct Searcher {
    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
}

#[pymethods]
impl Searcher {
    /// Search the index with the given query and collect results.
    ///
    /// Args:
    ///     query (Query): The query that will be used for the search.
    ///     collector (Collector): A collector that determines how the search
    ///         results will be collected. Only the TopDocs collector is
    ///         supported for now.
    ///
    /// Returns a list of tuples that contains the scores and DocAddress of the
    /// search results.
    ///
    /// Raises a ValueError if there was an error with the search.
    fn search(
        &self,
        query: &Query,
        collector: &mut TopDocs,
    ) -> PyResult<Vec<(f32, DocAddress)>> {
        let ret = self.inner.search(&query.inner, &collector.inner);
        match ret {
            Ok(r) => {
                let result: Vec<(f32, DocAddress)> = r
                    .iter()
                    .map(|(f, d)| (f.clone(), DocAddress::from(d)))
                    .collect();
                Ok(result)
            }
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }

    /// Returns the overall number of documents in the index.
    #[getter]
    fn num_docs(&self) -> u64 {
        self.inner.num_docs()
    }

    /// Fetches a document from Tantivy's store given a DocAddress.
    ///
    /// Args:
    ///     doc_address (DocAddress): The DocAddress that is associated with
    ///         the document that we wish to fetch.
    ///
    /// Returns the Document, raises ValueError if the document can't be found.
    fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
        let ret = self.inner.doc(doc_address.into());
        match ret {
            Ok(doc) => Ok(Document { inner: doc }),
            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
        }
    }
}

/// DocAddress contains all the necessary information to identify a document
/// given a Searcher object.
///
/// It consists of an id identifying its segment, and its segment-local DocId.
/// The id used for the segment is actually an ordinal in the list of segments
/// held by a Searcher.
#[pyclass]
pub(crate) struct DocAddress {
    pub(crate) segment_ord: tv::SegmentLocalId,
    pub(crate) doc: tv::DocId,
}

#[pymethods]
impl DocAddress {
    /// The segment ordinal is an id identifying the segment hosting the
    /// document. It is only meaningful in the context of a searcher.
    #[getter]
    fn segment_ord(&self) -> u32 {
        self.segment_ord
    }

    /// The segment local DocId
    #[getter]
    fn doc(&self) -> u32 {
        self.doc
    }
}

impl From<&tv::DocAddress> for DocAddress {
    fn from(doc_address: &tv::DocAddress) -> Self {
        DocAddress {
            segment_ord: doc_address.segment_ord(),
            doc: doc_address.doc(),
        }
    }
}

impl Into<tv::DocAddress> for &DocAddress {
    fn into(self) -> tv::DocAddress {
        tv::DocAddress(self.segment_ord(), self.doc())
    }
}

/// The Top Score Collector keeps track of the K documents sorted by their
/// score.
///
/// Args:
///     limit (int, optional): The number of documents that the top scorer will
///         retrieve. Must be a positive integer larger than 0. Defaults to 10.
#[pyclass]
pub(crate) struct TopDocs {
    inner: tv::collector::TopDocs,
}

#[pymethods]
impl TopDocs {
    #[new]
    #[args(limit = 10)]
    fn new(obj: &PyRawObject, limit: usize) -> PyResult<()> {
        let top = tv::collector::TopDocs::with_limit(limit);
        obj.init(TopDocs { inner: top });
        Ok(())
    }
}
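A sketch tying Searcher, TopDocs and DocAddress together; scores and addresses come back as plain Python tuples, and the indexed titles are illustrative:

```python
import tantivy

builder = tantivy.SchemaBuilder()
title = builder.add_text_field("title", stored=True)
schema = builder.build()

index = tantivy.Index(schema)
writer = index.writer()

for name in ("The Old Man and the Sea", "Of Mice and Men"):
    doc = tantivy.Document()
    doc.add_text(title, name)
    writer.add_document(doc)
writer.commit()

searcher = index.reader().searcher()
assert searcher.num_docs == 2

query = tantivy.QueryParser.for_index(index, [title]).parse_query("sea")

# TopDocs(3) keeps at most the three best-scoring documents.
for score, address in searcher.search(query, tantivy.TopDocs(3)):
    # A DocAddress is a segment ordinal plus a document id local to that segment.
    print(score, address.segment_ord, address.doc)
    hit = searcher.doc(address)
    print(hit.get_first(title))
```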
@ -0,0 +1 @@
from .tantivy import *
@ -0,0 +1,85 @@
import json
import tantivy


class TestClass(object):
    def test_simple_search(self):
        builder = tantivy.SchemaBuilder()

        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")

        schema = builder.build()
        index = tantivy.Index(schema)

        writer = index.writer()

        doc = tantivy.Document()
        doc.add_text(title, "The Old Man and the Sea")
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)

        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winter’s flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool")
        }))

        writer.add_document(doc)

        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings. I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking.")
        }))

        writer.add_document(doc)
        writer.commit()

        reader = index.reader()
        searcher = reader.searcher()

        query_parser = tantivy.QueryParser.for_index(index, [title, body])
        query = query_parser.parse_query("sea whale")

        top_docs = tantivy.TopDocs(10)

        result = searcher.search(query, top_docs)
        print(result)

        assert len(result) == 1

        _, doc_address = result[0]

        searched_doc = searcher.doc(doc_address)
        assert searched_doc.get_first(title) == "The Old Man and the Sea"

    def test_doc(self):
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)

        doc = tantivy.Document()
        assert doc.is_empty

        doc.add_text(title, "The Old Man and the Sea")

        assert doc.get_first(title) == "The Old Man and the Sea"

        assert doc.len == 1
        assert not doc.is_empty