Initial python bindings implementation.

2019-06-04 11:09:58 +02:00 · 2019-06-04 11:09:58 +02:00 · 5ea790518b
commit 5ea790518b
parent 5f4e1ef253
21 changed files with 1443 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,10 +1,7 @@
-# Generated by Cargo
-# will have compiled files and executables
-/target/
-
-# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
-# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
-Cargo.lock
-
-# These are backup files generated by rustfmt
+/target
 **/*.rs.bk
+Cargo.lock
+dist/
+__pycache__/
+tantivy.so
+tantivy.egg-info/
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,18 @@
+[package]
+name = "tantivy-python"
+version = "0.1.0"
+authors = ["Damir Jelić <poljar@termina.org.uk>"]
+edition = "2018"
+license = "MIT"
+
+[lib]
+name = "tantivy"
+crate-type = ["dylib"]
+
+[dependencies]
+chrono = "0.4"
+tantivy = { git = "https://github.com/tantivy-search/tantivy" }
+
+[dependencies.pyo3]
+version = "0.7.0"
+features = ["extension-module"]
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2019 tantivy
+Copyright (c) 2019 The Matrix.org Foundation CIC

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,4 @@
+include Cargo.toml
+include Makefile
+include rust-toolchain
+recursive-include src *
--- a/17
+++ b/17
@ -0,0 +1,17 @@
+source_files := $(wildcard src/*.rs)
+
+all: tantivy/tantivy.so
+
+PHONY: test format
+
+test: tantivy/tantivy.so
+	python3 -m pytest
+
+format:
+	rustfmt src/*.rs
+
+tantivy/tantivy.so: target/debug/libtantivy.so
+	cp target/debug/libtantivy.so tantivy/tantivy.so
+
+target/debug/libtantivy.so: $(source_files)
+	cargo build
--- a/README.md
+++ b/README.md
@ -0,0 +1,54 @@
+python-tantivy
+==============
+
+Python bindings for tantivy.
+
+
+# Installation
+
+The bindings can be installed using setuptools:
+
+    python setup.py install --user
+
+Note that this requires setuptools-rust to be installed. Another thing to note
+is that the bindings are using [PyO3](https://github.com/PyO3/pyo3), which
+requires rust nightly currently.
+
+# Usage
+
+python-tantivy has a similar API to tantivy. To create a index first a schema
+needs to be built. After that documents can be added to the index and a reader
+can be created to search the index.
+
+```python
+    builder = tantivy.SchemaBuilder()
+
+    title = builder.add_text_field("title", stored=True)
+    body = builder.add_text_field("body")
+
+    schema = builder.build()
+    index = tantivy.Index(schema)
+
+    writer = index.writer()
+
+    doc = tantivy.Document()
+    doc.add_text(title, "The Old Man and the Sea")
+    doc.add_text(body, ("He was an old man who fished alone in a skiff in"
+                        "the Gulf Stream and he had gone eighty-four days "
+                        "now without taking a fish."))
+    writer.add_document(doc)
+
+    reader = index.reader()
+    searcher = reader.searcher()
+
+    query_parser = tantivy.QueryParser.for_index(index, [title, body])
+    query = query_parser.parse_query("sea whale")
+
+    top_docs = tantivy.TopDocs(10)
+    result = searcher.search(query, top_docs)
+
+    _, doc_address = result[0]
+
+    searched_doc = searcher.doc(doc_address)
+    assert searched_doc.get_first(title) == "The Old Man and the Sea"
+```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,2 @@
+[build-system]
+requires = ["setuptools", "wheel", "setuptools-rust"]
--- a/1
+++ b/1
@ -0,0 +1 @@
+nightly-2019-05-22
--- a/rustfmt.toml
+++ b/rustfmt.toml
@ -0,0 +1 @@
+max_width = 80
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,15 @@
+from setuptools import setup
+
+try:
+    from setuptools_rust import Binding, RustExtension
+except ImportError:
+    print("Please install setuptools-rust package")
+    raise SystemExit(1)
+
+setup(
+    name="tantivy",
+    version="0.9.1",
+    rust_extensions=[RustExtension("tantivy.tantivy", binding=Binding.PyO3)],
+    packages=["tantivy"],
+    zip_safe=False,
+)
--- a/src/document.rs
+++ b/src/document.rs
@ -0,0 +1,152 @@
+use pyo3::prelude::*;
+use pyo3::types::PyDateTime;
+use pyo3::types::{PyDateAccess, PyTimeAccess};
+
+use chrono::offset::TimeZone;
+use chrono::Utc;
+
+use tantivy as tv;
+
+use crate::facet::Facet;
+use crate::field::{Field, FieldValue};
+
+/// Tantivy's Document is the object that can be indexed and then searched for.
+///
+/// Documents are fundamentally a collection of unordered tuples
+/// (field, value). In this list, one field may appear more than once.
+///
+/// Example:
+///     >>> doc = tantivy.Document()
+///     >>> doc.add_text(title, "The Old Man and the Sea")
+///     >>> doc.add_text(body, ("He was an old man who fished alone in a "
+///                             "skiff in the Gulf Stream and he had gone "
+///                             "eighty-four days now without taking a fish."))
+#[pyclass]
+pub(crate) struct Document {
+    pub(crate) inner: tv::Document,
+}
+
+#[pymethods]
+impl Document {
+    #[new]
+    fn new(obj: &PyRawObject) {
+        obj.init(Document {
+            inner: tv::Document::default(),
+        });
+    }
+
+    /// Add a text value to the document.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we are adding the text.
+    ///     text (str): The text that will be added to the document.
+    fn add_text(&mut self, field: &Field, text: &str) {
+        self.inner.add_text(field.inner, text);
+    }
+
+    /// Add an unsigned integer value to the document.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we are adding the integer.
+    ///     value (int): The integer that will be added to the document.
+    fn add_unsigned(&mut self, field: &Field, value: u64) {
+        self.inner.add_u64(field.inner, value);
+    }
+
+    /// Add a signed integer value to the document.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we are adding the integer.
+    ///     value (int): The integer that will be added to the document.
+    fn add_integer(&mut self, field: &Field, value: i64) {
+        self.inner.add_i64(field.inner, value);
+    }
+
+    /// Add a date value to the document.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we are adding the integer.
+    ///     value (datetime): The date that will be added to the document.
+    fn add_date(&mut self, field: &Field, value: &PyDateTime) {
+        let datetime = Utc
+            .ymd(
+                value.get_year().into(),
+                value.get_month().into(),
+                value.get_day().into(),
+            )
+            .and_hms_micro(
+                value.get_hour().into(),
+                value.get_minute().into(),
+                value.get_second().into(),
+                value.get_microsecond().into(),
+            );
+
+        self.inner.add_date(field.inner, &datetime);
+    }
+
+    /// Add a facet value to the document.
+    /// Args:
+    ///     field (Field): The field for which we are adding the facet.
+    ///     value (Facet): The Facet that will be added to the document.
+    fn add_facet(&mut self, field: &Field, value: &Facet) {
+        self.inner.add_facet(field.inner, value.inner.clone());
+    }
+
+    /// Add a bytes value to the document.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we are adding the bytes.
+    ///     value (bytes): The bytes that will be added to the document.
+    fn add_bytes(&mut self, field: &Field, value: Vec<u8>) {
+        self.inner.add_bytes(field.inner, value);
+    }
+
+    /// Returns the number of added fields that have been added to the document
+    #[getter]
+    fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// True if the document is empty, False otherwise.
+    #[getter]
+    fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    /// Get the first value associated with the given field.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we would like to get the value.
+    ///
+    /// Returns the value if one is found, otherwise None.
+    /// The type of the value depends on the field.
+    fn get_first(&self, py: Python, field: &Field) -> Option<PyObject> {
+        let value = self.inner.get_first(field.inner)?;
+        FieldValue::value_to_py(py, value)
+    }
+
+    /// Get the all values associated with the given field.
+    ///
+    /// Args:
+    ///     field (Field): The field for which we would like to get the values.
+    ///
+    /// Returns a list of values.
+    /// The type of the value depends on the field.
+    fn get_all(&self, py: Python, field: &Field) -> Vec<PyObject> {
+        let values = self.inner.get_all(field.inner);
+        values
+            .iter()
+            .map(|&v| FieldValue::value_to_py(py, v))
+            .filter_map(|x| x)
+            .collect()
+    }
+
+    /// Get all the fields and values contained in the document.
+    fn field_values(&self, py: Python) -> Vec<FieldValue> {
+        let field_values = self.inner.field_values();
+        field_values
+            .iter()
+            .map(|v| FieldValue::field_value_to_py(py, v))
+            .collect()
+    }
+}
--- a/src/facet.rs
+++ b/src/facet.rs
@ -0,0 +1,55 @@
+use pyo3::prelude::*;
+use pyo3::types::PyType;
+
+use tantivy::schema;
+
+/// A Facet represent a point in a given hierarchy.
+///
+/// They are typically represented similarly to a filepath. For instance, an
+/// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
+///
+/// A document can be associated to any number of facets. The hierarchy
+/// implicitely imply that a document belonging to a facet also belongs to the
+/// ancestor of its facet. In the example above, /electronics/tv_and_video/
+/// and /electronics.
+#[pyclass]
+pub(crate) struct Facet {
+    pub(crate) inner: schema::Facet,
+}
+
+#[pymethods]
+impl Facet {
+    /// Create a new instance of the "root facet" Equivalent to /.
+    #[classmethod]
+    fn root(_cls: &PyType) -> Facet {
+        Facet {
+            inner: schema::Facet::root(),
+        }
+    }
+
+    /// Returns true if the facet is the root facet /.
+    #[getter]
+    fn is_root(&self) -> bool {
+        self.inner.is_root()
+    }
+
+    /// Returns true if another Facet is a subfacet of this facet.
+    /// Args:
+    ///     other (Facet): The Facet that we should check if this facet is a
+    ///         subset of.
+    fn is_prefix_of(&self, other: &Facet) -> bool {
+        self.inner.is_prefix_of(&other.inner)
+    }
+
+    /// Create a Facet object from a string.
+    /// Args:
+    ///     facet_string (str): The string that contains a facet.
+    ///
+    /// Returns the created Facet.
+    #[classmethod]
+    fn from_string(_cls: &PyType, facet_string: &str) -> Facet {
+        Facet {
+            inner: schema::Facet::from_text(facet_string),
+        }
+    }
+}
--- a/src/field.rs
+++ b/src/field.rs
@ -0,0 +1,72 @@
+use pyo3::prelude::*;
+use pyo3::types::PyDateTime;
+
+use tantivy::schema;
+
+use crate::facet::Facet;
+
+/// Field is a numeric indentifier that represents an entry in the Schema.
+#[pyclass]
+#[derive(Clone)]
+pub(crate) struct Field {
+    pub(crate) inner: schema::Field,
+}
+
+/// FieldValue holds together a Field and its Value.
+#[pyclass]
+pub(crate) struct FieldValue {
+    pub(crate) field: Field,
+    pub(crate) value: PyObject,
+}
+
+#[pymethods]
+impl FieldValue {
+    #[getter]
+    fn field(&self) -> Field {
+        self.field.clone()
+    }
+
+    #[getter]
+    fn value(&self) -> &PyObject {
+        &self.value
+    }
+}
+
+impl FieldValue {
+    pub(crate) fn value_to_py(
+        py: Python,
+        value: &schema::Value,
+    ) -> Option<PyObject> {
+        match value {
+            schema::Value::Str(text) => Some(text.into_object(py)),
+            schema::Value::U64(num) => Some(num.into_object(py)),
+            schema::Value::I64(num) => Some(num.into_object(py)),
+            schema::Value::Bytes(b) => Some(b.to_object(py)),
+            schema::Value::Date(d) => {
+                let date =
+                    PyDateTime::from_timestamp(py, d.timestamp() as f64, None);
+
+                match date {
+                    Ok(d) => Some(d.into_object(py)),
+                    Err(_e) => None,
+                }
+            }
+            schema::Value::Facet(f) => {
+                Some(Facet { inner: f.clone() }.into_object(py))
+            }
+        }
+    }
+
+    pub(crate) fn field_value_to_py(
+        py: Python,
+        field_value: &schema::FieldValue,
+    ) -> FieldValue {
+        let value = field_value.value();
+        let field = field_value.field();
+
+        FieldValue {
+            field: Field { inner: field },
+            value: FieldValue::value_to_py(py, value).unwrap(),
+        }
+    }
+}
--- a/src/index.rs
+++ b/src/index.rs
@ -0,0 +1,284 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use crate::document::Document;
+use crate::schema::Schema;
+use crate::searcher::Searcher;
+use tantivy as tv;
+use tantivy::directory::MmapDirectory;
+
+const RELOAD_POLICY: &str = "commit";
+
+/// IndexReader is the entry point to read and search the index.
+///
+/// IndexReader controls when a new version of the index should be loaded and
+/// lends you instances of Searcher for the last loaded version.
+///
+/// To create an IndexReader first create an Index and call the reader() method
+/// on the index object.
+#[pyclass]
+pub(crate) struct IndexReader {
+    inner: tv::IndexReader,
+}
+
+#[pymethods]
+impl IndexReader {
+    /// Update searchers so that they reflect the state of the last .commit().
+    ///
+    /// If you set up the the reload policy to be on 'commit' (which is the
+    /// default) every commit should be rapidly reflected on your IndexReader
+    /// and you should not need to call reload() at all.
+    fn reload(&self) -> PyResult<()> {
+        let ret = self.inner.reload();
+        match ret {
+            Ok(_) => Ok(()),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Get a Searcher for the index.
+    ///
+    /// This method should be called every single time a search query is
+    /// performed. The searchers are taken from a pool of num_searchers
+    /// searchers.
+    ///
+    /// Returns a Searcher object, if no searcher is available this may block.
+    fn searcher(&self) -> Searcher {
+        let searcher = self.inner.searcher();
+        Searcher { inner: searcher }
+    }
+}
+
+/// IndexWriter is the user entry-point to add documents to the index.
+///
+/// To create an IndexWriter first create an Index and call the writer() method
+/// on the index object.
+#[pyclass]
+pub(crate) struct IndexWriter {
+    inner: tv::IndexWriter,
+}
+
+#[pymethods]
+impl IndexWriter {
+    /// Add a document to the index.
+    ///
+    /// If the indexing pipeline is full, this call may block.
+    ///
+    /// Returns an `opstamp`, which is an increasing integer that can be used
+    /// by the client to align commits with its own document queue.
+    /// The `opstamp` represents the number of documents that have been added
+    /// since the creation of the index.
+    fn add_document(&mut self, document: &Document) -> PyResult<()> {
+        self.inner.add_document(document.inner.clone());
+        Ok(())
+    }
+
+    /// Commits all of the pending changes
+    ///
+    /// A call to commit blocks. After it returns, all of the document that
+    /// were added since the last commit are published and persisted.
+    ///
+    /// In case of a crash or an hardware failure (as long as the hard disk is
+    /// spared), it will be possible to resume indexing from this point.
+    ///
+    /// Returns the `opstamp` of the last document that made it in the commit.
+    fn commit(&mut self) -> PyResult<()> {
+        let ret = self.inner.commit();
+        match ret {
+            Ok(_) => Ok(()),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Rollback to the last commit
+    ///
+    /// This cancels all of the update that happened before after the last
+    /// commit. After calling rollback, the index is in the same state as it
+    /// was after the last commit.
+    fn rollback(&mut self) -> PyResult<()> {
+        let ret = self.inner.rollback();
+
+        match ret {
+            Ok(_) => Ok(()),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Detect and removes the files that are not used by the index anymore.
+    fn garbage_collect_files(&mut self) -> PyResult<()> {
+        let ret = self.inner.garbage_collect_files();
+
+        match ret {
+            Ok(_) => Ok(()),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// The opstamp of the last successful commit.
+    ///
+    /// This is the opstamp the index will rollback to if there is a failure
+    /// like a power surge.
+    ///
+    /// This is also the opstamp of the commit that is currently available
+    /// for searchers.
+    #[getter]
+    fn commit_opstamp(&self) -> u64 {
+        self.inner.commit_opstamp()
+    }
+}
+
+/// Create a new index object.
+///
+/// Args:
+///     schema (Schema): The schema of the index.
+///     path (str, optional): The path where the index should be stored. If
+///         no path is provided, the index will be stored in memory.
+///     reuse (bool, optional): Should we open an existing index if one exists
+///         or always create a new one.
+///
+/// If an index already exists it will be opened and reused. Raises OSError
+/// if there was a problem during the opening or creation of the index.
+#[pyclass]
+pub(crate) struct Index {
+    pub(crate) inner: tv::Index,
+}
+
+#[pymethods]
+impl Index {
+    #[new]
+    #[args(reuse = true)]
+    fn new(
+        obj: &PyRawObject,
+        schema: &Schema,
+        path: Option<&str>,
+        reuse: bool,
+    ) -> PyResult<()> {
+        let index = match path {
+            Some(p) => {
+                let directory = MmapDirectory::open(p);
+
+                let dir = match directory {
+                    Ok(d) => d,
+                    Err(e) => {
+                        return Err(exceptions::OSError::py_err(e.to_string()))
+                    }
+                };
+
+                let i = if reuse {
+                    tv::Index::open_or_create(dir, schema.inner.clone())
+                } else {
+                    tv::Index::create(dir, schema.inner.clone())
+                };
+
+                match i {
+                    Ok(index) => index,
+                    Err(e) => {
+                        return Err(exceptions::OSError::py_err(e.to_string()))
+                    }
+                }
+            }
+            None => tv::Index::create_in_ram(schema.inner.clone()),
+        };
+
+        obj.init(Index { inner: index });
+        Ok(())
+    }
+
+    /// Create a `IndexWriter` for the index.
+    ///
+    /// The writer will be multithreaded and the provided heap size will be
+    /// split between the given number of threads.
+    ///
+    /// Args:
+    ///     overall_heap_size (int, optional): The total target memory usage of
+    ///         the writer, can't be less than 3000000.
+    ///     num_threads (int, optional): The number of threads that the writer
+    ///         should use. If this value is 0, tantivy will choose
+    ///         automatically the number of threads.
+    ///
+    /// Raises ValueError if there was an error while creating the writer.
+    #[args(heap_size = 3000000, num_threads = 0)]
+    fn writer(
+        &self,
+        heap_size: usize,
+        num_threads: usize,
+    ) -> PyResult<IndexWriter> {
+        let writer = match num_threads {
+            0 => self.inner.writer(heap_size),
+            _ => self.inner.writer_with_num_threads(num_threads, heap_size),
+        };
+
+        match writer {
+            Ok(w) => Ok(IndexWriter { inner: w }),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Create an IndexReader for the index.
+    ///
+    /// Args:
+    ///     reload_policy (str, optional): The reload policy that the
+    ///         IndexReader should use. Can be manual or OnCommit.
+    ///     num_searchers (int, optional): The number of searchers that the
+    ///         reader should create.
+    ///
+    /// Returns the IndexReader on success, raises ValueError if a IndexReader
+    /// couldn't be created.
+    #[args(reload_policy = "RELOAD_POLICY", num_searchers = 0)]
+    fn reader(
+        &self,
+        reload_policy: &str,
+        num_searchers: usize,
+    ) -> PyResult<IndexReader> {
+        let reload_policy = reload_policy.to_lowercase();
+        let reload_policy = match reload_policy.as_ref() {
+            "commit" => tv::ReloadPolicy::OnCommit,
+            "on-commit" => tv::ReloadPolicy::OnCommit,
+            "oncommit" => tv::ReloadPolicy::OnCommit,
+            "manual" => tv::ReloadPolicy::Manual,
+            _ => return Err(exceptions::ValueError::py_err(
+                "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
+            ))
+        };
+
+        let builder = self.inner.reader_builder();
+
+        let builder = builder.reload_policy(reload_policy);
+        let builder = if num_searchers > 0 {
+            builder.num_searchers(num_searchers)
+        } else {
+            builder
+        };
+
+        let reader = builder.try_into();
+        match reader {
+            Ok(r) => Ok(IndexReader { inner: r }),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Check if the given path contains an existing index.
+    /// Args:
+    ///     path: The path where tantivy will search for an index.
+    ///
+    /// Returns True if an index exists at the given path, False otherwise.
+    ///
+    /// Raises OSError if the directory cannot be opened.
+    #[staticmethod]
+    fn exists(path: &str) -> PyResult<bool> {
+        let directory = MmapDirectory::open(path);
+        let dir = match directory {
+            Ok(d) => d,
+            Err(e) => return Err(exceptions::OSError::py_err(e.to_string())),
+        };
+
+        Ok(tv::Index::exists(&dir))
+    }
+
+    /// The schema of the current index.
+    #[getter]
+    fn schema(&self) -> Schema {
+        let schema = self.inner.schema();
+        Schema { inner: schema }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,88 @@
+use pyo3::prelude::*;
+
+mod document;
+mod facet;
+mod field;
+mod index;
+mod query;
+mod schema;
+mod schemabuilder;
+mod searcher;
+
+use document::Document;
+use facet::Facet;
+use field::{Field, FieldValue};
+use index::Index;
+use query::QueryParser;
+use schema::Schema;
+use schemabuilder::SchemaBuilder;
+use searcher::{DocAddress, Searcher, TopDocs};
+
+/// Python bindings for the search engine library Tantivy.
+///
+/// Tantivy is a full text search engine library written in rust.
+///
+/// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
+/// the sense it is not an off-the-shelf search engine server, but rather
+/// a library that can be used to build such a search engine.
+/// Tantivy is, in fact, strongly inspired by Lucene's design.
+///
+/// Example:
+///     >>> import json
+///     >>> import tantivy
+///
+///     >>> builder = tantivy.SchemaBuilder()
+///
+///     >>> title = builder.add_text_field("title", stored=True)
+///     >>> body = builder.add_text_field("body")
+///
+///     >>> schema = builder.build()
+///     >>> index = tantivy.Index(schema)
+///     >>> doc = tantivy.Document()
+///     >>> doc.add_text(title, "The Old Man and the Sea")
+///     >>> doc.add_text(body, ("He was an old man who fished alone in a "
+///                             "skiff in the Gulf Stream and he had gone "
+///                             "eighty-four days now without taking a fish."))
+///
+///     >>> writer.add_document(doc)
+///
+///     >>> doc = schema.parse_document(json.dumps({
+///            "title": ["Frankenstein", "The Modern Prometheus"],
+///            "body": ("You will rejoice to hear that no disaster has "
+///                     "accompanied the commencement of an enterprise which "
+///                     "you have regarded with such evil forebodings.  "
+///                     "I arrived here yesterday, and my first task is to "
+///                     "assure my dear sister of my welfare and increasing "
+///                     "confidence in the success of my undertaking.")
+///     }))
+///
+///     >>> writer.add_document(doc)
+///     >>> writer.commit()
+///
+///     >>> reader = index.reader()
+///     >>> searcher = reader.searcher()
+///
+///     >>> query_parser = tantivy.QueryParser.for_index(index, [title, body])
+///     >>> query = query_parser.parse_query("sea whale")
+///
+///     >>> top_docs = tantivy.TopDocs.with_limit(10)
+///     >>> result = searcher.search(query, top_docs)
+///
+///     >>> assert len(result) == 1
+///
+#[pymodule]
+fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_class::<Schema>()?;
+    m.add_class::<SchemaBuilder>()?;
+    m.add_class::<Searcher>()?;
+    m.add_class::<Index>()?;
+    m.add_class::<QueryParser>()?;
+    m.add_class::<Document>()?;
+    m.add_class::<DocAddress>()?;
+    m.add_class::<TopDocs>()?;
+    m.add_class::<Field>()?;
+    m.add_class::<FieldValue>()?;
+    m.add_class::<Facet>()?;
+
+    Ok(())
+}
--- a/src/query.rs
+++ b/src/query.rs
@ -0,0 +1,70 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+use pyo3::types::PyType;
+
+use tantivy as tv;
+
+use crate::field::Field;
+use crate::index::Index;
+
+/// Tantivy's Query
+#[pyclass]
+pub(crate) struct Query {
+    pub(crate) inner: Box<dyn tv::query::Query>,
+}
+
+/// Tantivy's Query parser
+#[pyclass]
+pub(crate) struct QueryParser {
+    inner: tv::query::QueryParser,
+}
+
+#[pymethods]
+impl QueryParser {
+    /// Creates a QueryParser for an Index.
+    ///
+    /// Args:
+    ///     index (Index): The index for which the query will be created.
+    ///     default_fields (List[Field]): A list of fields used to search if no
+    ///         field is specified in the query.
+    ///
+    /// Returns the QueryParser.
+    #[classmethod]
+    fn for_index(
+        _cls: &PyType,
+        index: &Index,
+        default_fields: Vec<&Field>,
+    ) -> PyResult<QueryParser> {
+        let default_fields: Vec<tv::schema::Field> =
+            default_fields.iter().map(|&f| f.inner.clone()).collect();
+
+        let parser =
+            tv::query::QueryParser::for_index(&index.inner, default_fields);
+        Ok(QueryParser { inner: parser })
+    }
+
+    /// Parse a string into a query that can be given to a searcher.
+    ///
+    /// Args:
+    ///     query (str): A query string that should be parsed into a query.
+    ///
+    /// Returns the parsed Query object. Raises ValueError if there was an
+    /// error with the query string.
+    fn parse_query(&self, query: &str) -> PyResult<Query> {
+        let ret = self.inner.parse_query(query);
+
+        match ret {
+            Ok(q) => Ok(Query { inner: q }),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Set the default way to compose queries to a conjunction.
+    ///
+    /// By default, the query happy tax payer is equivalent to the query happy
+    /// OR tax OR payer. After calling .set_conjunction_by_default() happy tax
+    /// payer will be interpreted by the parser as happy AND tax AND payer.
+    fn set_conjunction_by_default(&mut self) {
+        self.inner.set_conjunction_by_default();
+    }
+}
--- a/src/schema.rs
+++ b/src/schema.rs
@ -0,0 +1,65 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use tantivy::schema;
+
+use crate::document::Document;
+use crate::field::Field;
+
+/// Tantivy schema.
+///
+/// The schema is very strict. To build the schema the `SchemaBuilder` class is
+/// provided.
+#[pyclass]
+pub(crate) struct Schema {
+    pub(crate) inner: schema::Schema,
+}
+
+#[pymethods]
+impl Schema {
+    /// Build a document object from a json string.
+    ///
+    /// Args:
+    ///     doc_json (str) - A string containing json that should be parsed
+    ///         into a `Document`
+    ///
+    /// Returns the parsed document, raises a ValueError if the parsing failed.
+    fn parse_document(&self, doc_json: &str) -> PyResult<Document> {
+        let ret = self.inner.parse_document(doc_json);
+        match ret {
+            Ok(d) => Ok(Document { inner: d }),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Convert a `Document` object into a json string.
+    ///
+    /// Args:
+    ///     doc (Document): The document that will be converted into a json
+    ///         string.
+    fn to_json(&self, doc: &Document) -> String {
+        self.inner.to_json(&doc.inner)
+    }
+
+    /// Return the field name for a given `Field`.
+    ///
+    /// Args:
+    ///     field (Field): The field for which the name will be returned.
+    fn get_field_name(&self, field: &Field) -> &str {
+        self.inner.get_field_name(field.inner)
+    }
+
+    /// Returns the field option associated with a given name.
+    ///
+    /// Args:
+    ///     name (str): The name of the field that we want to retrieve.
+    ///
+    /// Returns the Field if one is found, None otherwise.
+    fn get_field(&self, name: &str) -> Option<Field> {
+        let f = self.inner.get_field(name);
+        match f {
+            Some(field) => Some(Field { inner: field }),
+            None => None,
+        }
+    }
+}
--- a/src/schemabuilder.rs
+++ b/src/schemabuilder.rs
@ -0,0 +1,318 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use tantivy::schema;
+
+use crate::field::Field;
+use crate::schema::Schema;
+
+/// Tantivy has a very strict schema.
+/// You need to specify in advance whether a field is indexed or not,
+/// stored or not.
+///
+/// This is done by creating a schema object, and
+/// setting up the fields one by one.
+///
+/// Examples:
+///
+///     >>> builder = tantivy.SchemaBuilder()
+///
+///     >>> title = builder.add_text_field("title", stored=True)
+///     >>> body = builder.add_text_field("body")
+///
+///     >>> schema = builder.build()
+#[pyclass]
+pub(crate) struct SchemaBuilder {
+    pub(crate) builder: Option<schema::SchemaBuilder>,
+}
+
+const TOKENIZER: &str = "default";
+const RECORD: &str = "position";
+
+#[pymethods]
+impl SchemaBuilder {
+    #[new]
+    fn new(obj: &PyRawObject) {
+        obj.init(SchemaBuilder {
+            builder: Some(schema::Schema::builder()),
+        });
+    }
+
+    /// Add a new text field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     tokenizer_name (str, optional): The name of the tokenizer that
+    ///         should be used to process the field. Defaults to 'default'
+    ///     index_option (str, optional): Sets which information should be
+    ///         indexed with the tokens. Can be one of 'position', 'freq' or
+    ///         'basic'. Defaults to 'position'. The 'basic' index_option
+    ///         records only the document ID, the 'freq' option records the
+    ///         document id and the term frequency, while the 'position' option
+    ///         records the document id, term frequency and the positions of
+    ///         the term occurrences in the document.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if there was an error with the field creation.
+    #[args(
+        stored = false,
+        tokenizer_name = "TOKENIZER",
+        index_option = "RECORD"
+    )]
+    fn add_text_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        tokenizer_name: &str,
+        index_option: &str,
+    ) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        let index_option = match index_option {
+            "position" => schema::IndexRecordOption::WithFreqsAndPositions,
+            "freq" => schema::IndexRecordOption::WithFreqs,
+            "basic" => schema::IndexRecordOption::Basic,
+            _ => return Err(exceptions::ValueError::py_err(
+                "Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
+            ))
+        };
+
+        let indexing = schema::TextFieldIndexing::default()
+            .set_tokenizer(tokenizer_name)
+            .set_index_option(index_option);
+
+        let options =
+            schema::TextOptions::default().set_indexing_options(indexing);
+        let options = if stored {
+            options.set_stored()
+        } else {
+            options
+        };
+
+        if let Some(builder) = builder {
+            let field = builder.add_text_field(name, options);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Add a new signed integer field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     indexed (bool, optional): If true sets the field to be indexed.
+    ///     fast (str, optional): Set the u64 options as a single-valued fast
+    ///         field. Fast fields are designed for random access. Access time
+    ///         are similar to a random lookup in an array. If more than one
+    ///         value is associated to a fast field, only the last one is kept.
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
+    ///         the document must have exactly one value associated to the
+    ///         document. If this is set to 'multi', the document can have any
+    ///         number of values associated to the document. Defaults to None,
+    ///         which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if there was an error with the field creation.
+    #[args(stored = false, indexed = false)]
+    fn add_integer_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        if let Some(builder) = builder {
+            let field = builder.add_i64_field(name, opts);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Add a new unsigned integer field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     indexed (bool, optional): If true sets the field to be indexed.
+    ///     fast (str, optional): Set the u64 options as a single-valued fast
+    ///         field. Fast fields are designed for random access. Access time
+    ///         are similar to a random lookup in an array. If more than one
+    ///         value is associated to a fast field, only the last one is kept.
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
+    ///         the document must have exactly one value associated to the
+    ///         document. If this is set to 'multi', the document can have any
+    ///         number of values associated to the document. Defaults to None,
+    ///         which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if there was an error with the field creation.
+    #[args(stored = false, indexed = false)]
+    fn add_unsigned_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        if let Some(builder) = builder {
+            let field = builder.add_u64_field(name, opts);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Add a new date field to the schema.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    ///     stored (bool, optional): If true sets the field as stored, the
+    ///         content of the field can be later restored from a Searcher.
+    ///         Defaults to False.
+    ///     indexed (bool, optional): If true sets the field to be indexed.
+    ///     fast (str, optional): Set the u64 options as a single-valued fast
+    ///         field. Fast fields are designed for random access. Access time
+    ///         are similar to a random lookup in an array. If more than one
+    ///         value is associated to a fast field, only the last one is kept.
+    ///         Can be one of 'single' or 'multi'. If this is set to 'single,
+    ///         the document must have exactly one value associated to the
+    ///         document. If this is set to 'multi', the document can have any
+    ///         number of values associated to the document. Defaults to None,
+    ///         which disables this option.
+    ///
+    /// Returns the associated field handle.
+    /// Raises a ValueError if there was an error with the field creation.
+    #[args(stored = false, indexed = false)]
+    fn add_date_field(
+        &mut self,
+        name: &str,
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        let opts = SchemaBuilder::build_int_option(stored, indexed, fast)?;
+
+        if let Some(builder) = builder {
+            let field = builder.add_date_field(name, opts);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Add a Facet field to the schema.
+    /// Args:
+    ///     name (str): The name of the field.
+    fn add_facet_field(&mut self, name: &str) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        if let Some(builder) = builder {
+            let field = builder.add_facet_field(name);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Add a fast bytes field to the schema.
+    ///
+    /// Bytes field are not searchable and are only used
+    /// as fast field, to associate any kind of payload
+    /// to a document.
+    ///
+    /// Args:
+    ///     name (str): The name of the field.
+    fn add_bytes_field(&mut self, name: &str) -> PyResult<Field> {
+        let builder = &mut self.builder;
+
+        if let Some(builder) = builder {
+            let field = builder.add_bytes_field(name);
+            Ok(Field { inner: field })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+
+    /// Finalize the creation of a Schema.
+    ///
+    /// Returns a Schema object. After this is called the SchemaBuilder cannot
+    /// be used anymore.
+    fn build(&mut self) -> PyResult<Schema> {
+        let builder = self.builder.take();
+        if let Some(builder) = builder {
+            let schema = builder.build();
+            Ok(Schema { inner: schema })
+        } else {
+            Err(exceptions::ValueError::py_err(
+                "Schema builder object isn't valid anymore.",
+            ))
+        }
+    }
+}
+
+impl SchemaBuilder {
+    fn build_int_option(
+        stored: bool,
+        indexed: bool,
+        fast: Option<&str>,
+    ) -> PyResult<schema::IntOptions> {
+        let opts = schema::IntOptions::default();
+
+        let opts = if stored { opts.set_stored() } else { opts };
+        let opts = if indexed { opts.set_indexed() } else { opts };
+
+        let fast = match fast {
+            Some(f) => {
+                let f = f.to_lowercase();
+                match f.as_ref() {
+                    "single" => Some(schema::Cardinality::SingleValue),
+                    "multi" => Some(schema::Cardinality::MultiValues),
+                    _ => return Err(exceptions::ValueError::py_err(
+                        "Invalid index option, valid choices are: 'multivalue' and 'singlevalue'"
+                    )),
+                }
+            }
+            None => None,
+        };
+
+        let opts = if let Some(f) = fast {
+            opts.set_fast(f)
+        } else {
+            opts
+        };
+
+        Ok(opts)
+    }
+}
--- a/src/searcher.rs
+++ b/src/searcher.rs
@ -0,0 +1,134 @@
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use tantivy as tv;
+
+use crate::document::Document;
+use crate::query::Query;
+
+/// Tantivy's Searcher class
+///
+/// A Searcher is used to search the index given a prepared Query.
+#[pyclass]
+pub(crate) struct Searcher {
+    pub(crate) inner: tv::LeasedItem<tv::Searcher>,
+}
+
+#[pymethods]
+impl Searcher {
+    /// Search the index with the given query and collect results.
+    ///
+    /// Args:
+    ///     query (Query): The query that will be used for the search.
+    ///     collector (Collector): A collector that determines how the search
+    ///         results will be collected. Only the TopDocs collector is
+    ///         supported for now.
+    ///
+    /// Returns a list of tuples that contains the scores and DocAddress of the
+    /// search results.
+    ///
+    /// Raises a ValueError if there was an error with the search.
+    fn search(
+        &self,
+        query: &Query,
+        collector: &mut TopDocs,
+    ) -> PyResult<Vec<(f32, DocAddress)>> {
+        let ret = self.inner.search(&query.inner, &collector.inner);
+        match ret {
+            Ok(r) => {
+                let result: Vec<(f32, DocAddress)> = r
+                    .iter()
+                    .map(|(f, d)| (f.clone(), DocAddress::from(d)))
+                    .collect();
+                Ok(result)
+            }
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+
+    /// Returns the overall number of documents in the index.
+    #[getter]
+    fn num_docs(&self) -> u64 {
+        self.inner.num_docs()
+    }
+
+    /// Fetches a document from Tantivy's store given a DocAddress.
+    ///
+    /// Args:
+    ///     doc_address (DocAddress): The DocAddress that is associated with
+    ///         the document that we wish to fetch.
+    ///
+    /// Returns the Document, raises ValueError if the document can't be found.
+    fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
+        let ret = self.inner.doc(doc_address.into());
+        match ret {
+            Ok(doc) => Ok(Document { inner: doc }),
+            Err(e) => Err(exceptions::ValueError::py_err(e.to_string())),
+        }
+    }
+}
+
+/// DocAddress contains all the necessary information to identify a document
+/// given a Searcher object.
+///
+/// It consists in an id identifying its segment, and its segment-local DocId.
+/// The id used for the segment is actually an ordinal in the list of segment
+/// hold by a Searcher.
+#[pyclass]
+pub(crate) struct DocAddress {
+    pub(crate) segment_ord: tv::SegmentLocalId,
+    pub(crate) doc: tv::DocId,
+}
+
+#[pymethods]
+impl DocAddress {
+    /// The segment ordinal is an id identifying the segment hosting the
+    /// document. It is only meaningful, in the context of a searcher.
+    #[getter]
+    fn segment_ord(&self) -> u32 {
+        self.segment_ord
+    }
+
+    /// The segment local DocId
+    #[getter]
+    fn doc(&self) -> u32 {
+        self.doc
+    }
+}
+
+impl From<&tv::DocAddress> for DocAddress {
+    fn from(doc_address: &tv::DocAddress) -> Self {
+        DocAddress {
+            segment_ord: doc_address.segment_ord(),
+            doc: doc_address.doc(),
+        }
+    }
+}
+
+impl Into<tv::DocAddress> for &DocAddress {
+    fn into(self) -> tv::DocAddress {
+        tv::DocAddress(self.segment_ord(), self.doc())
+    }
+}
+
+/// The Top Score Collector keeps track of the K documents sorted by their
+/// score.
+///
+/// Args:
+///     limit (int, optional): The number of documents that the top scorer will
+///         retrieve. Must be a positive integer larger than 0. Defaults to 10.
+#[pyclass]
+pub(crate) struct TopDocs {
+    inner: tv::collector::TopDocs,
+}
+
+#[pymethods]
+impl TopDocs {
+    #[new]
+    #[args(limit = 10)]
+    fn new(obj: &PyRawObject, limit: usize) -> PyResult<()> {
+        let top = tv::collector::TopDocs::with_limit(limit);
+        obj.init(TopDocs { inner: top });
+        Ok(())
+    }
+}
--- a/tantivy/init.py
+++ b/tantivy/init.py
@ -0,0 +1 @@
+from .tantivy import *
--- a/tests/tantivy_test.py
+++ b/tests/tantivy_test.py
@ -0,0 +1,85 @@
+import json
+import tantivy
+
+
+class TestClass(object):
+    def test_simple_search(self):
+        builder = tantivy.SchemaBuilder()
+
+        title = builder.add_text_field("title", stored=True)
+        body = builder.add_text_field("body")
+
+        schema = builder.build()
+        index = tantivy.Index(schema)
+
+        writer = index.writer()
+
+        doc = tantivy.Document()
+        doc.add_text(title, "The Old Man and the Sea")
+        doc.add_text(body, ("He was an old man who fished alone in a skiff in"
+                            "the Gulf Stream and he had gone eighty-four days "
+                            "now without taking a fish."))
+        writer.add_document(doc)
+
+        doc = schema.parse_document(json.dumps({
+            "title": "Of Mice and Men",
+            "body": ("A few miles south of Soledad, the Salinas River drops "
+                     "in close to the hillside bank and runs deep and "
+                     "green. The water is warm too, for it has slipped "
+                     "twinkling over the yellow sands in the sunlight "
+                     "before reaching the narrow pool. On one side of the "
+                     "river the golden foothill slopes curve up to the "
+                     "strong and rocky Gabilan Mountains, but on the valley "
+                     "side the water is lined with trees—willows fresh and "
+                     "green with every spring, carrying in their lower leaf "
+                     "junctures the debris of the winter’s flooding; and "
+                     "sycamores with mottled, white, recumbent limbs and "
+                     "branches that arch over the pool")
+        }))
+
+        writer.add_document(doc)
+
+        doc = schema.parse_document(json.dumps({
+            "title": ["Frankenstein", "The Modern Prometheus"],
+            "body": ("You will rejoice to hear that no disaster has "
+                     "accompanied the commencement of an enterprise which you "
+                     "have regarded with such evil forebodings.  I arrived "
+                     "here yesterday, and my first task is to assure my dear "
+                     "sister of my welfare and increasing confidence in the "
+                     "success of my undertaking.")
+        }))
+
+        writer.add_document(doc)
+        writer.commit()
+
+        reader = index.reader()
+        searcher = reader.searcher()
+
+        query_parser = tantivy.QueryParser.for_index(index, [title, body])
+        query = query_parser.parse_query("sea whale")
+
+        top_docs = tantivy.TopDocs(10)
+
+        result = searcher.search(query, top_docs)
+        print(result)
+
+        assert len(result) == 1
+
+        _, doc_address = result[0]
+
+        searched_doc = searcher.doc(doc_address)
+        assert searched_doc.get_first(title) == "The Old Man and the Sea"
+
+    def test_doc(self):
+        builder = tantivy.SchemaBuilder()
+        title = builder.add_text_field("title", stored=True)
+
+        doc = tantivy.Document()
+        assert doc.is_empty
+
+        doc.add_text(title, "The Old Man and the Sea")
+
+        assert doc.get_first(title) == "The Old Man and the Sea"
+
+        assert doc.len == 1
+        assert not doc.is_empty