From 5ea790518bd483fcb9effb5c32e3a0d99afc308d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Jeli=C4=87?= Date: Tue, 4 Jun 2019 11:09:58 +0200 Subject: [PATCH] Initial python bindings implementation. --- .gitignore | 15 +- Cargo.toml | 18 +++ LICENSE | 2 +- MANIFEST.in | 4 + Makefile | 17 +++ README.md | 54 +++++++ pyproject.toml | 2 + rust-toolchain | 1 + rustfmt.toml | 1 + setup.py | 15 ++ src/document.rs | 152 ++++++++++++++++++++ src/facet.rs | 55 ++++++++ src/field.rs | 72 ++++++++++ src/index.rs | 284 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 88 ++++++++++++ src/query.rs | 70 ++++++++++ src/schema.rs | 65 +++++++++ src/schemabuilder.rs | 318 ++++++++++++++++++++++++++++++++++++++++++ src/searcher.rs | 134 ++++++++++++++++++ tantivy/__init__.py | 1 + tests/tantivy_test.py | 85 +++++++++++ 21 files changed, 1443 insertions(+), 10 deletions(-) create mode 100644 Cargo.toml create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.md create mode 100644 pyproject.toml create mode 100644 rust-toolchain create mode 100644 rustfmt.toml create mode 100644 setup.py create mode 100644 src/document.rs create mode 100644 src/facet.rs create mode 100644 src/field.rs create mode 100644 src/index.rs create mode 100644 src/lib.rs create mode 100644 src/query.rs create mode 100644 src/schema.rs create mode 100644 src/schemabuilder.rs create mode 100644 src/searcher.rs create mode 100644 tantivy/__init__.py create mode 100644 tests/tantivy_test.py diff --git a/.gitignore b/.gitignore index 088ba6b..9d8e9e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,7 @@ -# Generated by Cargo -# will have compiled files and executables -/target/ - -# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries -# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html -Cargo.lock - -# These are backup files generated by rustfmt +/target **/*.rs.bk +Cargo.lock +dist/ +__pycache__/ +tantivy.so 
+tantivy.egg-info/ diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0d3001d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "tantivy-python" +version = "0.1.0" +authors = ["Damir Jelić "] +edition = "2018" +license = "MIT" + +[lib] +name = "tantivy" +crate-type = ["dylib"] + +[dependencies] +chrono = "0.4" +tantivy = { git = "https://github.com/tantivy-search/tantivy" } + +[dependencies.pyo3] +version = "0.7.0" +features = ["extension-module"] diff --git a/LICENSE b/LICENSE index 39bf28e..559931a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 tantivy +Copyright (c) 2019 The Matrix.org Foundation CIC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c1b1cae --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include Cargo.toml +include Makefile +include rust-toolchain +recursive-include src * diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e2bd18f --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +source_files := $(wildcard src/*.rs) + +all: tantivy/tantivy.so + +PHONY: test format + +test: tantivy/tantivy.so + python3 -m pytest + +format: + rustfmt src/*.rs + +tantivy/tantivy.so: target/debug/libtantivy.so + cp target/debug/libtantivy.so tantivy/tantivy.so + +target/debug/libtantivy.so: $(source_files) + cargo build diff --git a/README.md b/README.md new file mode 100644 index 0000000..6738bec --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +python-tantivy +============== + +Python bindings for tantivy. + + +# Installation + +The bindings can be installed using setuptools: + + python setup.py install --user + +Note that this requires setuptools-rust to be installed. Another thing to note +is that the bindings are using [PyO3](https://github.com/PyO3/pyo3), which +requires rust nightly currently. 
+ +# Usage + +python-tantivy has a similar API to tantivy. To create an index, a schema +first needs to be built. After that, documents can be added to the index and a +reader can be created to search the index. + +```python + builder = tantivy.SchemaBuilder() + + title = builder.add_text_field("title", stored=True) + body = builder.add_text_field("body") + + schema = builder.build() + index = tantivy.Index(schema) + + writer = index.writer() + + doc = tantivy.Document() + doc.add_text(title, "The Old Man and the Sea") + doc.add_text(body, ("He was an old man who fished alone in a skiff in" + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish.")) + writer.add_document(doc) + + reader = index.reader() + searcher = reader.searcher() + + query_parser = tantivy.QueryParser.for_index(index, [title, body]) + query = query_parser.parse_query("sea whale") + + top_docs = tantivy.TopDocs(10) + result = searcher.search(query, top_docs) + + _, doc_address = result[0] + + searched_doc = searcher.doc(doc_address) + assert searched_doc.get_first(title) == "The Old Man and the Sea" +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..31ffe04 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "wheel", "setuptools-rust"] diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000..f677ace --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +nightly-2019-05-22 diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..df99c69 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +max_width = 80 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..91ddbe2 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup + +try: + from setuptools_rust import Binding, RustExtension +except ImportError: + print("Please install setuptools-rust package") + raise SystemExit(1) + +setup( + name="tantivy", + version="0.9.1", 
rust_extensions=[RustExtension("tantivy.tantivy", binding=Binding.PyO3)], + packages=["tantivy"], + zip_safe=False, +) diff --git a/src/document.rs b/src/document.rs new file mode 100644 index 0000000..2eaa007 --- /dev/null +++ b/src/document.rs @@ -0,0 +1,152 @@ +use pyo3::prelude::*; +use pyo3::types::PyDateTime; +use pyo3::types::{PyDateAccess, PyTimeAccess}; + +use chrono::offset::TimeZone; +use chrono::Utc; + +use tantivy as tv; + +use crate::facet::Facet; +use crate::field::{Field, FieldValue}; + +/// Tantivy's Document is the object that can be indexed and then searched for. +/// +/// Documents are fundamentally a collection of unordered tuples +/// (field, value). In this list, one field may appear more than once. +/// +/// Example: +/// >>> doc = tantivy.Document() +/// >>> doc.add_text(title, "The Old Man and the Sea") +/// >>> doc.add_text(body, ("He was an old man who fished alone in a " +/// "skiff in the Gulf Stream and he had gone " +/// "eighty-four days now without taking a fish.")) +#[pyclass] +pub(crate) struct Document { + pub(crate) inner: tv::Document, +} + +#[pymethods] +impl Document { + #[new] + fn new(obj: &PyRawObject) { + obj.init(Document { + inner: tv::Document::default(), + }); + } + + /// Add a text value to the document. + /// + /// Args: + /// field (Field): The field for which we are adding the text. + /// text (str): The text that will be added to the document. + fn add_text(&mut self, field: &Field, text: &str) { + self.inner.add_text(field.inner, text); + } + + /// Add an unsigned integer value to the document. + /// + /// Args: + /// field (Field): The field for which we are adding the integer. + /// value (int): The integer that will be added to the document. + fn add_unsigned(&mut self, field: &Field, value: u64) { + self.inner.add_u64(field.inner, value); + } + + /// Add a signed integer value to the document. + /// + /// Args: + /// field (Field): The field for which we are adding the integer. 
+ /// value (int): The integer that will be added to the document. + fn add_integer(&mut self, field: &Field, value: i64) { + self.inner.add_i64(field.inner, value); + } + + /// Add a date value to the document. + /// + /// Args: + /// field (Field): The field for which we are adding the integer. + /// value (datetime): The date that will be added to the document. + fn add_date(&mut self, field: &Field, value: &PyDateTime) { + let datetime = Utc + .ymd( + value.get_year().into(), + value.get_month().into(), + value.get_day().into(), + ) + .and_hms_micro( + value.get_hour().into(), + value.get_minute().into(), + value.get_second().into(), + value.get_microsecond().into(), + ); + + self.inner.add_date(field.inner, &datetime); + } + + /// Add a facet value to the document. + /// Args: + /// field (Field): The field for which we are adding the facet. + /// value (Facet): The Facet that will be added to the document. + fn add_facet(&mut self, field: &Field, value: &Facet) { + self.inner.add_facet(field.inner, value.inner.clone()); + } + + /// Add a bytes value to the document. + /// + /// Args: + /// field (Field): The field for which we are adding the bytes. + /// value (bytes): The bytes that will be added to the document. + fn add_bytes(&mut self, field: &Field, value: Vec) { + self.inner.add_bytes(field.inner, value); + } + + /// Returns the number of added fields that have been added to the document + #[getter] + fn len(&self) -> usize { + self.inner.len() + } + + /// True if the document is empty, False otherwise. + #[getter] + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Get the first value associated with the given field. + /// + /// Args: + /// field (Field): The field for which we would like to get the value. + /// + /// Returns the value if one is found, otherwise None. + /// The type of the value depends on the field. 
+ fn get_first(&self, py: Python, field: &Field) -> Option { + let value = self.inner.get_first(field.inner)?; + FieldValue::value_to_py(py, value) + } + + /// Get all the values associated with the given field. + /// + /// Args: + /// field (Field): The field for which we would like to get the values. + /// + /// Returns a list of values. + /// The type of the value depends on the field. + fn get_all(&self, py: Python, field: &Field) -> Vec { + let values = self.inner.get_all(field.inner); + values + .iter() + .map(|&v| FieldValue::value_to_py(py, v)) + .filter_map(|x| x) + .collect() + } + + /// Get all the fields and values contained in the document. + fn field_values(&self, py: Python) -> Vec { + let field_values = self.inner.field_values(); + field_values + .iter() + .map(|v| FieldValue::field_value_to_py(py, v)) + .collect() + } +} diff --git a/src/facet.rs b/src/facet.rs new file mode 100644 index 0000000..de7e930 --- /dev/null +++ b/src/facet.rs @@ -0,0 +1,55 @@ +use pyo3::prelude::*; +use pyo3::types::PyType; + +use tantivy::schema; + +/// A Facet represents a point in a given hierarchy. +/// +/// They are typically represented similarly to a filepath. For instance, an +/// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv. +/// +/// A document can be associated to any number of facets. The hierarchy +/// implicitly implies that a document belonging to a facet also belongs to the +/// ancestors of its facet. In the example above, /electronics/tv_and_video/ +/// and /electronics. +#[pyclass] +pub(crate) struct Facet { + pub(crate) inner: schema::Facet, +} + +#[pymethods] +impl Facet { + /// Create a new instance of the "root facet", equivalent to /. + #[classmethod] + fn root(_cls: &PyType) -> Facet { + Facet { + inner: schema::Facet::root(), + } + } + + /// Returns true if the facet is the root facet /. 
+ #[getter] + fn is_root(&self) -> bool { + self.inner.is_root() + } + + /// Returns true if another Facet is a subfacet of this facet. + /// Args: + /// other (Facet): The Facet to check for being a subfacet of this + /// facet. + fn is_prefix_of(&self, other: &Facet) -> bool { + self.inner.is_prefix_of(&other.inner) + } + + /// Create a Facet object from a string. + /// Args: + /// facet_string (str): The string that contains a facet. + /// + /// Returns the created Facet. + #[classmethod] + fn from_string(_cls: &PyType, facet_string: &str) -> Facet { + Facet { + inner: schema::Facet::from_text(facet_string), + } + } +} diff --git a/src/field.rs b/src/field.rs new file mode 100644 index 0000000..8a7f2e3 --- /dev/null +++ b/src/field.rs @@ -0,0 +1,72 @@ +use pyo3::prelude::*; +use pyo3::types::PyDateTime; + +use tantivy::schema; + +use crate::facet::Facet; + +/// Field is a numeric identifier that represents an entry in the Schema. +#[pyclass] +#[derive(Clone)] +pub(crate) struct Field { + pub(crate) inner: schema::Field, +} + +/// FieldValue holds together a Field and its Value. 
+#[pyclass] +pub(crate) struct FieldValue { + pub(crate) field: Field, + pub(crate) value: PyObject, +} + +#[pymethods] +impl FieldValue { + #[getter] + fn field(&self) -> Field { + self.field.clone() + } + + #[getter] + fn value(&self) -> &PyObject { + &self.value + } +} + +impl FieldValue { + pub(crate) fn value_to_py( + py: Python, + value: &schema::Value, + ) -> Option { + match value { + schema::Value::Str(text) => Some(text.into_object(py)), + schema::Value::U64(num) => Some(num.into_object(py)), + schema::Value::I64(num) => Some(num.into_object(py)), + schema::Value::Bytes(b) => Some(b.to_object(py)), + schema::Value::Date(d) => { + let date = + PyDateTime::from_timestamp(py, d.timestamp() as f64, None); + + match date { + Ok(d) => Some(d.into_object(py)), + Err(_e) => None, + } + } + schema::Value::Facet(f) => { + Some(Facet { inner: f.clone() }.into_object(py)) + } + } + } + + pub(crate) fn field_value_to_py( + py: Python, + field_value: &schema::FieldValue, + ) -> FieldValue { + let value = field_value.value(); + let field = field_value.field(); + + FieldValue { + field: Field { inner: field }, + value: FieldValue::value_to_py(py, value).unwrap(), + } + } +} diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..fdc90b0 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,284 @@ +use pyo3::exceptions; +use pyo3::prelude::*; + +use crate::document::Document; +use crate::schema::Schema; +use crate::searcher::Searcher; +use tantivy as tv; +use tantivy::directory::MmapDirectory; + +const RELOAD_POLICY: &str = "commit"; + +/// IndexReader is the entry point to read and search the index. +/// +/// IndexReader controls when a new version of the index should be loaded and +/// lends you instances of Searcher for the last loaded version. +/// +/// To create an IndexReader first create an Index and call the reader() method +/// on the index object. 
+#[pyclass] +pub(crate) struct IndexReader { + inner: tv::IndexReader, +} + +#[pymethods] +impl IndexReader { + /// Update searchers so that they reflect the state of the last .commit(). + /// + /// If you set up the reload policy to be on 'commit' (which is the + /// default) every commit should be rapidly reflected on your IndexReader + /// and you should not need to call reload() at all. + fn reload(&self) -> PyResult<()> { + let ret = self.inner.reload(); + match ret { + Ok(_) => Ok(()), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Get a Searcher for the index. + /// + /// This method should be called every single time a search query is + /// performed. The searchers are taken from a pool of num_searchers + /// searchers. + /// + /// Returns a Searcher object. If no searcher is available this may block. + fn searcher(&self) -> Searcher { + let searcher = self.inner.searcher(); + Searcher { inner: searcher } + } +} + +/// IndexWriter is the user entry-point to add documents to the index. +/// +/// To create an IndexWriter first create an Index and call the writer() method +/// on the index object. +#[pyclass] +pub(crate) struct IndexWriter { + inner: tv::IndexWriter, +} + +#[pymethods] +impl IndexWriter { + /// Add a document to the index. + /// + /// If the indexing pipeline is full, this call may block. + /// + /// Tantivy assigns the document an `opstamp`, an increasing integer that + /// the client can use to align commits with its own document queue. It + /// counts the documents added since the creation of the index. Note that + /// this binding currently discards the `opstamp` and returns None. + fn add_document(&mut self, document: &Document) -> PyResult<()> { + self.inner.add_document(document.inner.clone()); + Ok(()) + } + + /// Commits all of the pending changes + /// + /// A call to commit blocks. After it returns, all of the documents that + /// were added since the last commit are published and persisted. 
+ /// + /// In case of a crash or a hardware failure (as long as the hard disk is + /// spared), it will be possible to resume indexing from this point. + /// + /// Returns None; the `opstamp` of the commit is discarded by the binding. + fn commit(&mut self) -> PyResult<()> { + let ret = self.inner.commit(); + match ret { + Ok(_) => Ok(()), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Roll back to the last commit + /// + /// This cancels all of the updates that happened after the last + /// commit. After calling rollback, the index is in the same state as it + /// was after the last commit. + fn rollback(&mut self) -> PyResult<()> { + let ret = self.inner.rollback(); + + match ret { + Ok(_) => Ok(()), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Detects and removes the files that are not used by the index anymore. + fn garbage_collect_files(&mut self) -> PyResult<()> { + let ret = self.inner.garbage_collect_files(); + + match ret { + Ok(_) => Ok(()), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// The opstamp of the last successful commit. + /// + /// This is the opstamp the index will roll back to if there is a failure + /// like a power surge. + /// + /// This is also the opstamp of the commit that is currently available + /// for searchers. + #[getter] + fn commit_opstamp(&self) -> u64 { + self.inner.commit_opstamp() + } +} + +/// Create a new index object. +/// +/// Args: +/// schema (Schema): The schema of the index. +/// path (str, optional): The path where the index should be stored. If +/// no path is provided, the index will be stored in memory. +/// reuse (bool, optional): Should we open an existing index if one exists +/// or always create a new one. +/// +/// If reuse is true and an index already exists, it will be opened and +/// reused. Raises OSError if the index cannot be opened or created. 
+#[pyclass] +pub(crate) struct Index { + pub(crate) inner: tv::Index, +} + +#[pymethods] +impl Index { + #[new] + #[args(reuse = true)] + fn new( + obj: &PyRawObject, + schema: &Schema, + path: Option<&str>, + reuse: bool, + ) -> PyResult<()> { + let index = match path { + Some(p) => { + let directory = MmapDirectory::open(p); + + let dir = match directory { + Ok(d) => d, + Err(e) => { + return Err(exceptions::OSError::py_err(e.to_string())) + } + }; + + let i = if reuse { + tv::Index::open_or_create(dir, schema.inner.clone()) + } else { + tv::Index::create(dir, schema.inner.clone()) + }; + + match i { + Ok(index) => index, + Err(e) => { + return Err(exceptions::OSError::py_err(e.to_string())) + } + } + } + None => tv::Index::create_in_ram(schema.inner.clone()), + }; + + obj.init(Index { inner: index }); + Ok(()) + } + + /// Create a `IndexWriter` for the index. + /// + /// The writer will be multithreaded and the provided heap size will be + /// split between the given number of threads. + /// + /// Args: + /// overall_heap_size (int, optional): The total target memory usage of + /// the writer, can't be less than 3000000. + /// num_threads (int, optional): The number of threads that the writer + /// should use. If this value is 0, tantivy will choose + /// automatically the number of threads. + /// + /// Raises ValueError if there was an error while creating the writer. + #[args(heap_size = 3000000, num_threads = 0)] + fn writer( + &self, + heap_size: usize, + num_threads: usize, + ) -> PyResult { + let writer = match num_threads { + 0 => self.inner.writer(heap_size), + _ => self.inner.writer_with_num_threads(num_threads, heap_size), + }; + + match writer { + Ok(w) => Ok(IndexWriter { inner: w }), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Create an IndexReader for the index. + /// + /// Args: + /// reload_policy (str, optional): The reload policy that the + /// IndexReader should use. Can be manual or OnCommit. 
+ /// num_searchers (int, optional): The number of searchers that the + /// reader should create. + /// + /// Returns the IndexReader on success, raises ValueError if a IndexReader + /// couldn't be created. + #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)] + fn reader( + &self, + reload_policy: &str, + num_searchers: usize, + ) -> PyResult { + let reload_policy = reload_policy.to_lowercase(); + let reload_policy = match reload_policy.as_ref() { + "commit" => tv::ReloadPolicy::OnCommit, + "on-commit" => tv::ReloadPolicy::OnCommit, + "oncommit" => tv::ReloadPolicy::OnCommit, + "manual" => tv::ReloadPolicy::Manual, + _ => return Err(exceptions::ValueError::py_err( + "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'" + )) + }; + + let builder = self.inner.reader_builder(); + + let builder = builder.reload_policy(reload_policy); + let builder = if num_searchers > 0 { + builder.num_searchers(num_searchers) + } else { + builder + }; + + let reader = builder.try_into(); + match reader { + Ok(r) => Ok(IndexReader { inner: r }), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Check if the given path contains an existing index. + /// Args: + /// path: The path where tantivy will search for an index. + /// + /// Returns True if an index exists at the given path, False otherwise. + /// + /// Raises OSError if the directory cannot be opened. + #[staticmethod] + fn exists(path: &str) -> PyResult { + let directory = MmapDirectory::open(path); + let dir = match directory { + Ok(d) => d, + Err(e) => return Err(exceptions::OSError::py_err(e.to_string())), + }; + + Ok(tv::Index::exists(&dir)) + } + + /// The schema of the current index. 
+ #[getter] + fn schema(&self) -> Schema { + let schema = self.inner.schema(); + Schema { inner: schema } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..697c40b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,88 @@ +use pyo3::prelude::*; + +mod document; +mod facet; +mod field; +mod index; +mod query; +mod schema; +mod schemabuilder; +mod searcher; + +use document::Document; +use facet::Facet; +use field::{Field, FieldValue}; +use index::Index; +use query::QueryParser; +use schema::Schema; +use schemabuilder::SchemaBuilder; +use searcher::{DocAddress, Searcher, TopDocs}; + +/// Python bindings for the search engine library Tantivy. +/// +/// Tantivy is a full text search engine library written in rust. +/// +/// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in +/// the sense it is not an off-the-shelf search engine server, but rather +/// a library that can be used to build such a search engine. +/// Tantivy is, in fact, strongly inspired by Lucene's design. +/// +/// Example: +/// >>> import json +/// >>> import tantivy +/// +/// >>> builder = tantivy.SchemaBuilder() +/// +/// >>> title = builder.add_text_field("title", stored=True) +/// >>> body = builder.add_text_field("body") +/// +/// >>> schema = builder.build() +/// >>> index = tantivy.Index(schema) +/// >>> doc = tantivy.Document() +/// >>> doc.add_text(title, "The Old Man and the Sea") +/// >>> doc.add_text(body, ("He was an old man who fished alone in a " +/// "skiff in the Gulf Stream and he had gone " +/// "eighty-four days now without taking a fish.")) +/// +/// >>> writer.add_document(doc) +/// +/// >>> doc = schema.parse_document(json.dumps({ +/// "title": ["Frankenstein", "The Modern Prometheus"], +/// "body": ("You will rejoice to hear that no disaster has " +/// "accompanied the commencement of an enterprise which " +/// "you have regarded with such evil forebodings. 
" +/// "I arrived here yesterday, and my first task is to " +/// "assure my dear sister of my welfare and increasing " +/// "confidence in the success of my undertaking.") +/// })) +/// +/// >>> writer.add_document(doc) +/// >>> writer.commit() +/// +/// >>> reader = index.reader() +/// >>> searcher = reader.searcher() +/// +/// >>> query_parser = tantivy.QueryParser.for_index(index, [title, body]) +/// >>> query = query_parser.parse_query("sea whale") +/// +/// >>> top_docs = tantivy.TopDocs.with_limit(10) +/// >>> result = searcher.search(query, top_docs) +/// +/// >>> assert len(result) == 1 +/// +#[pymodule] +fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/query.rs b/src/query.rs new file mode 100644 index 0000000..89a5ff2 --- /dev/null +++ b/src/query.rs @@ -0,0 +1,70 @@ +use pyo3::exceptions; +use pyo3::prelude::*; +use pyo3::types::PyType; + +use tantivy as tv; + +use crate::field::Field; +use crate::index::Index; + +/// Tantivy's Query +#[pyclass] +pub(crate) struct Query { + pub(crate) inner: Box, +} + +/// Tantivy's Query parser +#[pyclass] +pub(crate) struct QueryParser { + inner: tv::query::QueryParser, +} + +#[pymethods] +impl QueryParser { + /// Creates a QueryParser for an Index. + /// + /// Args: + /// index (Index): The index for which the query will be created. + /// default_fields (List[Field]): A list of fields used to search if no + /// field is specified in the query. + /// + /// Returns the QueryParser. 
+ #[classmethod] + fn for_index( + _cls: &PyType, + index: &Index, + default_fields: Vec<&Field>, + ) -> PyResult { + let default_fields: Vec = + default_fields.iter().map(|&f| f.inner.clone()).collect(); + + let parser = + tv::query::QueryParser::for_index(&index.inner, default_fields); + Ok(QueryParser { inner: parser }) + } + + /// Parse a string into a query that can be given to a searcher. + /// + /// Args: + /// query (str): A query string that should be parsed into a query. + /// + /// Returns the parsed Query object. Raises ValueError if there was an + /// error with the query string. + fn parse_query(&self, query: &str) -> PyResult { + let ret = self.inner.parse_query(query); + + match ret { + Ok(q) => Ok(Query { inner: q }), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Set the default way to compose queries to a conjunction. + /// + /// By default, the query happy tax payer is equivalent to the query happy + /// OR tax OR payer. After calling .set_conjunction_by_default() happy tax + /// payer will be interpreted by the parser as happy AND tax AND payer. + fn set_conjunction_by_default(&mut self) { + self.inner.set_conjunction_by_default(); + } +} diff --git a/src/schema.rs b/src/schema.rs new file mode 100644 index 0000000..5ced6cf --- /dev/null +++ b/src/schema.rs @@ -0,0 +1,65 @@ +use pyo3::exceptions; +use pyo3::prelude::*; + +use tantivy::schema; + +use crate::document::Document; +use crate::field::Field; + +/// Tantivy schema. +/// +/// The schema is very strict. To build the schema the `SchemaBuilder` class is +/// provided. +#[pyclass] +pub(crate) struct Schema { + pub(crate) inner: schema::Schema, +} + +#[pymethods] +impl Schema { + /// Build a document object from a json string. + /// + /// Args: + /// doc_json (str) - A string containing json that should be parsed + /// into a `Document` + /// + /// Returns the parsed document, raises a ValueError if the parsing failed. 
+ fn parse_document(&self, doc_json: &str) -> PyResult { + let ret = self.inner.parse_document(doc_json); + match ret { + Ok(d) => Ok(Document { inner: d }), + Err(e) => Err(exceptions::ValueError::py_err(e.to_string())), + } + } + + /// Convert a `Document` object into a json string. + /// + /// Args: + /// doc (Document): The document that will be converted into a json + /// string. + fn to_json(&self, doc: &Document) -> String { + self.inner.to_json(&doc.inner) + } + + /// Return the field name for a given `Field`. + /// + /// Args: + /// field (Field): The field for which the name will be returned. + fn get_field_name(&self, field: &Field) -> &str { + self.inner.get_field_name(field.inner) + } + + /// Returns the field option associated with a given name. + /// + /// Args: + /// name (str): The name of the field that we want to retrieve. + /// + /// Returns the Field if one is found, None otherwise. + fn get_field(&self, name: &str) -> Option { + let f = self.inner.get_field(name); + match f { + Some(field) => Some(Field { inner: field }), + None => None, + } + } +} diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs new file mode 100644 index 0000000..b915259 --- /dev/null +++ b/src/schemabuilder.rs @@ -0,0 +1,318 @@ +use pyo3::exceptions; +use pyo3::prelude::*; + +use tantivy::schema; + +use crate::field::Field; +use crate::schema::Schema; + +/// Tantivy has a very strict schema. +/// You need to specify in advance whether a field is indexed or not, +/// stored or not. +/// +/// This is done by creating a schema object, and +/// setting up the fields one by one. 
+/// +/// Examples: +/// +/// >>> builder = tantivy.SchemaBuilder() +/// +/// >>> title = builder.add_text_field("title", stored=True) +/// >>> body = builder.add_text_field("body") +/// +/// >>> schema = builder.build() +#[pyclass] +pub(crate) struct SchemaBuilder { + pub(crate) builder: Option, +} + +const TOKENIZER: &str = "default"; +const RECORD: &str = "position"; + +#[pymethods] +impl SchemaBuilder { + #[new] + fn new(obj: &PyRawObject) { + obj.init(SchemaBuilder { + builder: Some(schema::Schema::builder()), + }); + } + + /// Add a new text field to the schema. + /// + /// Args: + /// name (str): The name of the field. + /// stored (bool, optional): If true sets the field as stored, the + /// content of the field can be later restored from a Searcher. + /// Defaults to False. + /// tokenizer_name (str, optional): The name of the tokenizer that + /// should be used to process the field. Defaults to 'default' + /// index_option (str, optional): Sets which information should be + /// indexed with the tokens. Can be one of 'position', 'freq' or + /// 'basic'. Defaults to 'position'. The 'basic' index_option + /// records only the document ID, the 'freq' option records the + /// document id and the term frequency, while the 'position' option + /// records the document id, term frequency and the positions of + /// the term occurrences in the document. + /// + /// Returns the associated field handle. + /// Raises a ValueError if there was an error with the field creation. 
+ #[args( + stored = false, + tokenizer_name = "TOKENIZER", + index_option = "RECORD" + )] + fn add_text_field( + &mut self, + name: &str, + stored: bool, + tokenizer_name: &str, + index_option: &str, + ) -> PyResult { + let builder = &mut self.builder; + + let index_option = match index_option { + "position" => schema::IndexRecordOption::WithFreqsAndPositions, + "freq" => schema::IndexRecordOption::WithFreqs, + "basic" => schema::IndexRecordOption::Basic, + _ => return Err(exceptions::ValueError::py_err( + "Invalid index option, valid choices are: 'basic', 'freq' and 'position'" + )) + }; + + let indexing = schema::TextFieldIndexing::default() + .set_tokenizer(tokenizer_name) + .set_index_option(index_option); + + let options = + schema::TextOptions::default().set_indexing_options(indexing); + let options = if stored { + options.set_stored() + } else { + options + }; + + if let Some(builder) = builder { + let field = builder.add_text_field(name, options); + Ok(Field { inner: field }) + } else { + Err(exceptions::ValueError::py_err( + "Schema builder object isn't valid anymore.", + )) + } + } + + /// Add a new signed integer field to the schema. + /// + /// Args: + /// name (str): The name of the field. + /// stored (bool, optional): If true sets the field as stored, the + /// content of the field can be later restored from a Searcher. + /// Defaults to False. + /// indexed (bool, optional): If true sets the field to be indexed. + /// fast (str, optional): Set the u64 options as a single-valued fast + /// field. Fast fields are designed for random access. Access time + /// are similar to a random lookup in an array. If more than one + /// value is associated to a fast field, only the last one is kept. + /// Can be one of 'single' or 'multi'. If this is set to 'single, + /// the document must have exactly one value associated to the + /// document. If this is set to 'multi', the document can have any + /// number of values associated to the document. 
+    ///         Defaults to None, which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if `fast` is not a valid choice or if the schema
+    /// has already been built.
+    #[args(stored = false, indexed = false)]
+    fn add_integer_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        // Validate the options first so an invalid `fast` value is reported
+        // even when the builder has already been consumed.
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        // The builder is `None` once `build()` has consumed it.
+        match self.builder.as_mut() {
+            Some(builder) => Ok(Field {
+                inner: builder.add_i64_field(name, opts),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+
+    /// Add a new unsigned integer field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     indexed (bool, optional): If true sets the field to be indexed.
+    ///         Defaults to False.
+    ///     fast (str, optional): Set the u64 options as a fast field. Fast
+    ///         fields are designed for random access. Access times are
+    ///         similar to a random lookup in an array. If more than one
+    ///         value is associated to a fast field, only the last one is kept.
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
+    ///         the document must have exactly one value associated to the
+    ///         document. If this is set to 'multi', the document can have any
+    ///         number of values associated to the document. Defaults to None,
+    ///         which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if `fast` is not a valid choice or if the schema
+    /// has already been built.
+    #[args(stored = false, indexed = false)]
+    fn add_unsigned_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        // Validate the options first so an invalid `fast` value is reported
+        // even when the builder has already been consumed.
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        // The builder is `None` once `build()` has consumed it.
+        match self.builder.as_mut() {
+            Some(builder) => Ok(Field {
+                inner: builder.add_u64_field(name, opts),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+
+    /// Add a new date field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     indexed (bool, optional): If true sets the field to be indexed.
+    ///         Defaults to False.
+    ///     fast (str, optional): Set the date options as a fast field. Fast
+    ///         fields are designed for random access. Access times are
+    ///         similar to a random lookup in an array. If more than one
+    ///         value is associated to a fast field, only the last one is kept.
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single',
+    ///         the document must have exactly one value associated to the
+    ///         document. If this is set to 'multi', the document can have any
+    ///         number of values associated to the document. Defaults to None,
+    ///         which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if `fast` is not a valid choice or if the schema
+    /// has already been built.
+    #[args(stored = false, indexed = false)]
+    fn add_date_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        // Validate the options first so an invalid `fast` value is reported
+        // even when the builder has already been consumed.
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        // The builder is `None` once `build()` has consumed it.
+        match self.builder.as_mut() {
+            Some(builder) => Ok(Field {
+                inner: builder.add_date_field(name, opts),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+
+    /// Add a Facet field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if the schema has already been built.
+    fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
+        // The builder is `None` once `build()` has consumed it.
+        match self.builder.as_mut() {
+            Some(builder) => Ok(Field {
+                inner: builder.add_facet_field(name),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+
+    /// Add a fast bytes field to the schema.
+    ///
+    /// Bytes field are not searchable and are only used
+    /// as fast field, to associate any kind of payload
+    /// to a document.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if the schema has already been built.
+    fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
+        // The builder is `None` once `build()` has consumed it.
+        match self.builder.as_mut() {
+            Some(builder) => Ok(Field {
+                inner: builder.add_bytes_field(name),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+
+    /// Finalize the creation of a Schema.
+    ///
+    /// Returns a Schema object. After this is called the SchemaBuilder cannot
+    /// be used anymore.
+    fn build(&mut self) -> PyResult<Schema> {
+        // `take()` leaves `None` behind, which is what invalidates this
+        // builder for every later call.
+        match self.builder.take() {
+            Some(builder) => Ok(Schema {
+                inner: builder.build(),
+            }),
+            None => Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            )),
+        }
+    }
+}
+
+impl SchemaBuilder {
+    // Build the tantivy integer options shared by the signed, unsigned and
+    // date field methods.
+    //
+    // `fast` is matched case-insensitively against 'single' and 'multi';
+    // `None` leaves the fast-field option disabled. Any other value yields a
+    // ValueError.
+    fn build_int_option(
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<schema::IntOptions> {
+        let mut opts = schema::IntOptions::default();
+
+        if stored {
+            opts = opts.set_stored();
+        }
+        if indexed {
+            opts = opts.set_indexed();
+        }
+
+        if let Some(fast) = fast {
+            let fast = fast.to_lowercase();
+            let cardinality = match fast.as_str() {
+                "single" => schema::Cardinality::SingleValue,
+                "multi" => schema::Cardinality::MultiValues,
+                _ => {
+                    // Name the values that are actually accepted above.
+                    return Err(exceptions::ValueError::py_err(
+                        "Invalid fast option, valid choices are: 'single' and 'multi'",
+                    ));
+                }
+            };
+            opts = opts.set_fast(cardinality);
+        }
+
+        Ok(opts)
+    }
+}
diff --git a/src/searcher.rs b/src/searcher.rs
new file mode 100644
index 0000000..777cd43
--- /dev/null
+++ b/src/searcher.rs
@@ -0,0 +1,134 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use tantivy as tv;
+
+use crate::document::Document;
+use crate::query::Query;
+
+/// Tantivy's Searcher class
+///
+/// A Searcher is used to search the index given a prepared Query.
+#[pyclass]
+pub(crate) struct Searcher {
+    // The tantivy searcher that every method delegates to.
+    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
+}
+
+#[pymethods]
+impl Searcher {
+    /// Search the index with the given query and collect results.
+    ///
+    /// Args:
+    ///     query (Query): The query that will be used for the search.
+    ///     collector (Collector): A collector that determines how the search
+    ///         results will be collected. Only the TopDocs collector is
+    ///         supported for now.
+    ///
+    /// Returns a list of tuples that contains the scores and DocAddress of the
+    /// search results.
+    ///
+    /// Raises a ValueError if there was an error with the search.
+    fn search(
+        &self,
+        query: &Query,
+        collector: &mut TopDocs,
+    ) -> PyResult<Vec<(f32, DocAddress)>> {
+        let hits = self
+            .inner
+            .search(&query.inner, &collector.inner)
+            .map_err(|e| exceptions::ValueError::py_err(e.to_string()))?;
+
+        // Convert tantivy's addresses into the Python-visible wrapper type;
+        // the score is a plain f32 and is simply copied.
+        Ok(hits
+            .iter()
+            .map(|(score, address)| (*score, DocAddress::from(address)))
+            .collect())
+    }
+
+    /// Returns the overall number of documents in the index.
+    #[getter]
+    fn num_docs(&self) -> u64 {
+        self.inner.num_docs()
+    }
+
+    /// Fetches a document from Tantivy's store given a DocAddress.
+    ///
+    /// Args:
+    ///     doc_address (DocAddress): The DocAddress that is associated with
+    ///         the document that we wish to fetch.
+    ///
+    /// Returns the Document, raises ValueError if the document can't be found.
+    fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
+        self.inner
+            .doc(doc_address.into())
+            .map(|doc| Document { inner: doc })
+            .map_err(|e| exceptions::ValueError::py_err(e.to_string()))
+    }
+}
+
+/// DocAddress contains all the necessary information to identify a document
+/// given a Searcher object.
+///
+/// It consists of an id identifying its segment, and its segment-local DocId.
+/// The id used for the segment is actually an ordinal in the list of segments
+/// held by a Searcher.
+#[pyclass]
+pub(crate) struct DocAddress {
+    // Ordinal of the segment that hosts the document.
+    pub(crate) segment_ord: tv::SegmentLocalId,
+    // Id of the document inside that segment.
+    pub(crate) doc: tv::DocId,
+}
+
+#[pymethods]
+impl DocAddress {
+    /// The segment ordinal is an id identifying the segment hosting the
+    /// document. It is only meaningful, in the context of a searcher.
+    #[getter]
+    fn segment_ord(&self) -> u32 {
+        self.segment_ord
+    }
+
+    /// The segment local DocId
+    #[getter]
+    fn doc(&self) -> u32 {
+        self.doc
+    }
+}
+
+impl From<&tv::DocAddress> for DocAddress {
+    fn from(doc_address: &tv::DocAddress) -> Self {
+        DocAddress {
+            segment_ord: doc_address.segment_ord(),
+            doc: doc_address.doc(),
+        }
+    }
+}
+
+// Implemented as `From` rather than `Into`: the standard library's blanket
+// impl still provides `(&DocAddress).into()` for existing callers.
+impl From<&DocAddress> for tv::DocAddress {
+    fn from(doc_address: &DocAddress) -> Self {
+        tv::DocAddress(doc_address.segment_ord, doc_address.doc)
+    }
+}
+
+/// The Top Score Collector keeps track of the K documents sorted by their
+/// score.
+///
+/// Args:
+///     limit (int, optional): The number of documents that the top scorer will
+///         retrieve. Must be a positive integer larger than 0. Defaults to 10.
+///
+/// Raises a ValueError if limit is 0.
+#[pyclass]
+pub(crate) struct TopDocs {
+    // The tantivy collector handed to `Searcher::search`.
+    inner: tv::collector::TopDocs,
+}
+
+#[pymethods]
+impl TopDocs {
+    #[new]
+    #[args(limit = 10)]
+    fn new(obj: &PyRawObject, limit: usize) -> PyResult<()> {
+        // tantivy's `with_limit` panics on a limit of 0; report the problem
+        // as a Python exception instead.
+        if limit == 0 {
+            return Err(exceptions::ValueError::py_err(
+                "The limit must be a positive integer larger than 0.",
+            ));
+        }
+
+        let top = tv::collector::TopDocs::with_limit(limit);
+        obj.init(TopDocs { inner: top });
+        Ok(())
+    }
+}
diff --git a/tantivy/__init__.py b/tantivy/__init__.py
new file mode 100644
index 0000000..b462ae2
--- /dev/null
+++ b/tantivy/__init__.py
@@ -0,0 +1 @@
+from .tantivy import *
diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py
new file mode 100644
index 0000000..5a72026
--- /dev/null
+++ b/tests/tantivy_test.py
@@ -0,0 +1,85 @@
+import json
+import tantivy
+
+
+class TestClass(object):
+    def test_simple_search(self):
+        """Index three documents and check that a query finds the right one."""
+        builder = tantivy.SchemaBuilder()
+
+        title = builder.add_text_field("title", stored=True)
+        body = builder.add_text_field("body")
+
+        schema = builder.build()
+        index = tantivy.Index(schema)
+
+        writer = index.writer()
+
+        # First document: built field by field through the Document API.
+        doc = tantivy.Document()
+        doc.add_text(title, "The Old Man and the Sea")
+        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
+                            "the Gulf Stream and he had gone eighty-four days "
+                            "now without taking a fish."))
+        writer.add_document(doc)
+
+        # Second document: parsed from a JSON string by the schema.
+        doc = schema.parse_document(json.dumps({
+            "title": "Of Mice and Men",
+            "body": ("A few miles south of Soledad, the Salinas River drops "
+                     "in close to the hillside bank and runs deep and "
+                     "green. The water is warm too, for it has slipped "
+                     "twinkling over the yellow sands in the sunlight "
+                     "before reaching the narrow pool. On one side of the "
+                     "river the golden foothill slopes curve up to the "
+                     "strong and rocky Gabilan Mountains, but on the valley "
+                     "side the water is lined with trees—willows fresh and "
+                     "green with every spring, carrying in their lower leaf "
+                     "junctures the debris of the winter’s flooding; and "
+                     "sycamores with mottled, white, recumbent limbs and "
+                     "branches that arch over the pool")
+        }))
+
+        writer.add_document(doc)
+
+        # Here "title" is a JSON list rather than a single string.
+        doc = schema.parse_document(json.dumps({
+            "title": ["Frankenstein", "The Modern Prometheus"],
+            "body": ("You will rejoice to hear that no disaster has "
+                     "accompanied the commencement of an enterprise which you "
+                     "have regarded with such evil forebodings. I arrived "
+                     "here yesterday, and my first task is to assure my dear "
+                     "sister of my welfare and increasing confidence in the "
+                     "success of my undertaking.")
+        }))
+
+        writer.add_document(doc)
+        # NOTE(review): presumably the commit is what makes the added
+        # documents visible to the reader below - confirm.
+        writer.commit()
+
+        reader = index.reader()
+        searcher = reader.searcher()
+
+        # Search both the title and the body field.
+        query_parser = tantivy.QueryParser.for_index(index, [title, body])
+        query = query_parser.parse_query("sea whale")
+
+        # Collect at most 10 hits.
+        top_docs = tantivy.TopDocs(10)
+
+        result = searcher.search(query, top_docs)
+        print(result)
+
+        # Exactly one of the three documents is expected to match.
+        assert len(result) == 1
+
+        # Each hit is a (score, DocAddress) tuple; the score is not checked.
+        _, doc_address = result[0]
+
+        searched_doc = searcher.doc(doc_address)
+        assert searched_doc.get_first(title) == "The Old Man and the Sea"
+
+    def test_doc(self):
+        """Check that text added to a Document can be read back from it."""
+        builder = tantivy.SchemaBuilder()
+        title = builder.add_text_field("title", stored=True)
+
+        # A freshly created document holds no values.
+        doc = tantivy.Document()
+        assert doc.is_empty
+
+        doc.add_text(title, "The Old Man and the Sea")
+
+        assert doc.get_first(title) == "The Old Man and the Sea"
+
+        assert doc.len == 1
+        assert not doc.is_empty