Adding delete_document function to index (#12)

* Adding Term, Field and delete_term API mapping * Fixing test * Fixing code doc * Removing Term and Field and nesting code on delete_term * Fixing lint * Delete_documents and documentation * Fixing style and testing exceptions * Fixing missing return * Fixing fmt
2019-09-09 14:30:50 +02:00 · 2019-09-09 14:30:50 +02:00 · 75f0670dd6
commit 75f0670dd6
parent 51f080d001
3 changed files with 109 additions and 38 deletions
--- a/src/document.rs
+++ b/src/document.rs
@ -109,7 +109,7 @@ where
        .push(Value::from(value));
 }

-fn extract_value(any: &PyAny) -> PyResult<Value> {
+pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
    if let Ok(s) = any.extract::<String>() {
        return Ok(Value::Str(s));
    }
--- a/src/index.rs
+++ b/src/index.rs
@ -2,15 +2,16 @@

 use pyo3::exceptions;
 use pyo3::prelude::*;
+use pyo3::types::PyAny;

-use crate::document::Document;
+use crate::document::{extract_value, Document};
 use crate::query::Query;
 use crate::schema::Schema;
 use crate::searcher::Searcher;
 use crate::to_pyerr;
 use tantivy as tv;
 use tantivy::directory::MmapDirectory;
-use tantivy::schema::{Field, NamedFieldDocument};
+use tantivy::schema::{Field, NamedFieldDocument, Term, Value};

 const RELOAD_POLICY: &str = "commit";

@ -94,6 +95,44 @@ impl IndexWriter {
    fn commit_opstamp(&self) -> u64 {
        self.inner_index_writer.commit_opstamp()
    }
+
+    /// Delete all documents containing a given term.
+    ///
+    /// The term is built from `field_name` and the Python `field_value`, then
+    /// handed to the underlying tantivy writer's `delete_term`.
+    ///
+    /// Args:
+    ///     field_name (str): The field name for which we want to filter deleted docs.
+    ///     field_value (PyAny): Python object with the value we want to filter.
+    ///
+    /// Returns the `u64` produced by the underlying writer's `delete_term`
+    /// (in tantivy this is the operation stamp assigned to the delete).
+    ///
+    /// If the field_name is not on the schema raises ValueError exception.
+    /// If the field_value is of bytes type raises ValueError exception.
+    /// Any error raised while converting field_value (via `extract_value`)
+    /// is propagated unchanged.
+    fn delete_documents(
+        &mut self,
+        field_name: &str,
+        field_value: &PyAny,
+    ) -> PyResult<u64> {
+        // Resolve the field first so an unknown name fails before the value
+        // is even inspected.
+        let field = self.schema.get_field(field_name).ok_or_else(|| {
+            exceptions::ValueError::py_err(format!(
+                "Field `{}` is not defined in the schema.",
+                field_name
+            ))
+        })?;
+
+        // Convert the Python object into a tantivy `Value`, then into the
+        // matching `Term` constructor for that value kind.
+        let value = extract_value(field_value)?;
+        let term = match value {
+            Value::Str(text) => Term::from_field_text(field, &text),
+            Value::U64(num) => Term::from_field_u64(field, num),
+            Value::I64(num) => Term::from_field_i64(field, num),
+            Value::F64(num) => Term::from_field_f64(field, num),
+            Value::Date(d) => Term::from_field_date(field, &d),
+            Value::Facet(facet) => Term::from_facet(field, &facet),
+            // Bytes values are explicitly rejected rather than turned into a term.
+            Value::Bytes(_) => {
+                return Err(exceptions::ValueError::py_err(format!(
+                    "Field `{}` is bytes type not deletable.",
+                    field_name
+                )))
+            }
+        };
+        // `term` is not used again, so move it into the call instead of cloning.
+        Ok(self.inner_index_writer.delete_term(term))
+    }
 }

 /// Create a new index object.
--- a/tests/tantivy_test.py
+++ b/tests/tantivy_test.py
@ -5,10 +5,7 @@ from tantivy import Document, Index, SchemaBuilder, Schema


 def schema():
-    return SchemaBuilder() \
-        .add_text_field("title", stored=True) \
-        .add_text_field("body") \
-        .build()
+    return SchemaBuilder().add_text_field("title", stored=True).add_text_field("body").build()


@pytest.fixture(scope="class")
@ -24,15 +21,22 @@ def ram_index():
    # create a document instance
    # add field-value pairs
    doc.add_text("title", "The Old Man and the Sea")
-    doc.add_text("body", ("He was an old man who fished alone in a skiff in"
+    doc.add_text(
+        "body",
+        (
+            "He was an old man who fished alone in a skiff in"
            "the Gulf Stream and he had gone eighty-four days "
-                          "now without taking a fish."))
+            "now without taking a fish."
+        ),
+    )
    writer.add_document(doc)
    # 2 use the built-in json support
    # keys need to coincide with field names
-    doc = Document.from_dict({
+    doc = Document.from_dict(
+        {
            "title": "Of Mice and Men",
-        "body": ("A few miles south of Soledad, the Salinas River drops "
+            "body": (
+                "A few miles south of Soledad, the Salinas River drops "
                "in close to the hillside bank and runs deep and "
                "green. The water is warm too, for it has slipped "
                "twinkling over the yellow sands in the sunlight "
@ -43,20 +47,23 @@ def ram_index():
                "green with every spring, carrying in their lower leaf "
                "junctures the debris of the winter’s flooding; and "
                "sycamores with mottled, white, recumbent limbs and "
-                 "branches that arch over the pool")
-    })
+                "branches that arch over the pool"
+            ),
+        }
+    )
    writer.add_document(doc)
-    writer.add_json("""{
+    writer.add_json(
+        """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings.  I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
-        }""")
+        }"""
+    )
    writer.commit()
    index.reload()
    return index


 class TestClass(object):
-
    def test_simple_search(self, ram_index):
        index = ram_index
        query = index.parse_query("sea whale", ["title", "body"])
@ -91,10 +98,12 @@ class TestClass(object):

    def test_and_query_parser_default_fields_undefined(self, ram_index):
        query = ram_index.parse_query("winter")
-        assert repr(query) == "Query(BooleanQuery { subqueries: [" \
-                              "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \
-                              "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \
+        assert (
+            repr(query) == "Query(BooleanQuery { subqueries: ["
+            "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), "
+            "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] "
            "})"
+        )

    def test_query_errors(self, ram_index):
        index = ram_index
@ -103,11 +112,33 @@ class TestClass(object):
            index.parse_query("bod:men", ["title", "body"])


+class TestUpdateClass(object):
+    def test_delete_update(self, ram_index):
+        query = ram_index.parse_query("Frankenstein", ["title"])
+        top_docs = tantivy.TopDocs(10)
+        result = ram_index.searcher().search(query, top_docs)
+        assert len(result) == 1
+
+        writer = ram_index.writer()
+
+        with pytest.raises(ValueError):
+            writer.delete_documents("fake_field", "frankenstein")
+
+        with pytest.raises(ValueError):
+            writer.delete_documents("title", b"frankenstein")
+
+        writer.delete_documents("title", "frankenstein")
+        writer.commit()
+        ram_index.reload()
+
+        result = ram_index.searcher().search(query, top_docs)
+        assert len(result) == 0
+
+
 PATH_TO_INDEX = "tests/test_index/"


 class TestFromDiskClass(object):
-
    def test_exists(self):
        # prefer to keep it separate in case anyone deletes this
        # runs from the root directory
@ -136,6 +167,7 @@ class TestFromDiskClass(object):
        writer.add_document(Document(title="mytitle2", body="mybody2"))
        writer.commit()
        import time
+
        for i in range(50):
            # The index should be automatically reloaded.
            # Wait for at most 5s for it to happen.
@ -151,7 +183,6 @@ class TestSearcher(object):


 class TestDocument(object):
-
    def test_document(self):
        doc = tantivy.Document(name="Bill", reference=[1, 2])
        assert doc["reference"] == [1, 2]
@ -162,7 +193,8 @@ class TestDocument(object):

    def test_document_with_date(self):
        import datetime
-        date = datetime.datetime(2019, 8, 12, 13, 0, 0, )
+
+        date = datetime.datetime(2019, 8, 12, 13, 0, 0)
        doc = tantivy.Document(name="Bill", date=date)
        assert doc["date"][0] == date

@ -174,15 +206,15 @@ class TestDocument(object):
        doc = tantivy.Document()
        facet = tantivy.Facet.from_string("/europe/france")
        doc.add_facet("facet", facet)
-        assert doc["facet"][0].to_path() == ['europe', 'france']
+        assert doc["facet"][0].to_path() == ["europe", "france"]
        doc = tantivy.Document()
        facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
        doc.add_facet("facet", facet)
-        assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
+        assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"]
        assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
        assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"
        doc = tantivy.Document(facet=facet)
-        assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji']
+        assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"]

    def test_document_error(self):
        with pytest.raises(ValueError):