From 75f0670dd619d653248c3d8ad904ef9a9be4c214 Mon Sep 17 00:00:00 2001 From: Ramon Navarro Bosch Date: Mon, 9 Sep 2019 14:30:50 +0200 Subject: [PATCH] Adding delete_document function to index (#12) * Adding Term, Field and delete_term API mapping * Fixing test * Fixing code doc * Removing Term and Field and nesting code on delete_term * Fixing lint * Delete_documents and documentation * Fixing style and testing exceptions * Fixing missing return * Fixing fmt --- src/document.rs | 2 +- src/index.rs | 43 +++++++++++++++++- tests/tantivy_test.py | 102 +++++++++++++++++++++++++++--------------- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/src/document.rs b/src/document.rs index 84b6cce..cffe861 100644 --- a/src/document.rs +++ b/src/document.rs @@ -109,7 +109,7 @@ where .push(Value::from(value)); } -fn extract_value(any: &PyAny) -> PyResult { +pub(crate) fn extract_value(any: &PyAny) -> PyResult { if let Ok(s) = any.extract::() { return Ok(Value::Str(s)); } diff --git a/src/index.rs b/src/index.rs index f02bdbf..fd80297 100644 --- a/src/index.rs +++ b/src/index.rs @@ -2,15 +2,16 @@ use pyo3::exceptions; use pyo3::prelude::*; +use pyo3::types::PyAny; -use crate::document::Document; +use crate::document::{extract_value, Document}; use crate::query::Query; use crate::schema::Schema; use crate::searcher::Searcher; use crate::to_pyerr; use tantivy as tv; use tantivy::directory::MmapDirectory; -use tantivy::schema::{Field, NamedFieldDocument}; +use tantivy::schema::{Field, NamedFieldDocument, Term, Value}; const RELOAD_POLICY: &str = "commit"; @@ -94,6 +95,44 @@ impl IndexWriter { fn commit_opstamp(&self) -> u64 { self.inner_index_writer.commit_opstamp() } + + /// Delete all documents containing a given term. + /// + /// Args: + /// field_name (str): The field name for which we want to filter deleted docs. + /// field_value (PyAny): Python object with the value we want to filter. + /// + /// If the field_name is not on the schema raises ValueError exception. + /// If the field_value is not supported raises Exception. + fn delete_documents( + &mut self, + field_name: &str, + field_value: &PyAny, + ) -> PyResult { + let field = self.schema.get_field(field_name).ok_or_else(|| { + exceptions::ValueError::py_err(format!( + "Field `{}` is not defined in the schema.", + field_name + )) + })?; + + let value = extract_value(field_value)?; + let term = match value { + Value::Str(text) => Term::from_field_text(field, &text), + Value::U64(num) => Term::from_field_u64(field, num), + Value::I64(num) => Term::from_field_i64(field, num), + Value::F64(num) => Term::from_field_f64(field, num), + Value::Date(d) => Term::from_field_date(field, &d), + Value::Facet(facet) => Term::from_facet(field, &facet), + Value::Bytes(_) => { + return Err(exceptions::ValueError::py_err(format!( + "Field `{}` is bytes type not deletable.", + field_name + ))) + } + }; + Ok(self.inner_index_writer.delete_term(term.clone())) + } } /// Create a new index object. diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 202f9bd..c6dfc9b 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -5,10 +5,7 @@ from tantivy import Document, Index, SchemaBuilder, Schema def schema(): - return SchemaBuilder() \ - .add_text_field("title", stored=True) \ - .add_text_field("body") \ - .build() + return SchemaBuilder().add_text_field("title", stored=True).add_text_field("body").build() @pytest.fixture(scope="class") @@ -24,39 +21,49 @@ def ram_index(): # create a document instance # add field-value pairs doc.add_text("title", "The Old Man and the Sea") - doc.add_text("body", ("He was an old man who fished alone in a skiff in" - "the Gulf Stream and he had gone eighty-four days " - "now without taking a fish.")) + doc.add_text( + "body", + ( + "He was an old man who fished alone in a skiff in" + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish." + ), + ) writer.add_document(doc) # 2 use the built-in json support # keys need to coincide with field names - doc = Document.from_dict({ - "title": "Of Mice and Men", - "body": ("A few miles south of Soledad, the Salinas River drops " - "in close to the hillside bank and runs deep and " - "green. The water is warm too, for it has slipped " - "twinkling over the yellow sands in the sunlight " - "before reaching the narrow pool. On one side of the " - "river the golden foothill slopes curve up to the " - "strong and rocky Gabilan Mountains, but on the valley " - "side the water is lined with trees—willows fresh and " - "green with every spring, carrying in their lower leaf " - "junctures the debris of the winter’s flooding; and " - "sycamores with mottled, white, recumbent limbs and " - "branches that arch over the pool") - }) + doc = Document.from_dict( + { + "title": "Of Mice and Men", + "body": ( + "A few miles south of Soledad, the Salinas River drops " + "in close to the hillside bank and runs deep and " + "green. The water is warm too, for it has slipped " + "twinkling over the yellow sands in the sunlight " + "before reaching the narrow pool. On one side of the " + "river the golden foothill slopes curve up to the " + "strong and rocky Gabilan Mountains, but on the valley " + "side the water is lined with trees—willows fresh and " + "green with every spring, carrying in their lower leaf " + "junctures the debris of the winter’s flooding; and " + "sycamores with mottled, white, recumbent limbs and " + "branches that arch over the pool" + ), + } + ) writer.add_document(doc) - writer.add_json("""{ + writer.add_json( + """{ "title": ["Frankenstein", "The Modern Prometheus"], "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." - }""") + }""" + ) writer.commit() index.reload() return index class TestClass(object): - def test_simple_search(self, ram_index): index = ram_index query = index.parse_query("sea whale", ["title", "body"]) @@ -91,10 +98,12 @@ class TestClass(object): def test_and_query_parser_default_fields_undefined(self, ram_index): query = ram_index.parse_query("winter") - assert repr(query) == "Query(BooleanQuery { subqueries: [" \ - "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \ - "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \ - "})" + assert ( + repr(query) == "Query(BooleanQuery { subqueries: [" + "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " + "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " + "})" + ) def test_query_errors(self, ram_index): index = ram_index @@ -103,11 +112,33 @@ class TestClass(object): index.parse_query("bod:men", ["title", "body"]) +class TestUpdateClass(object): + def test_delete_update(self, ram_index): + query = ram_index.parse_query("Frankenstein", ["title"]) + top_docs = tantivy.TopDocs(10) + result = ram_index.searcher().search(query, top_docs) + assert len(result) == 1 + + writer = ram_index.writer() + + with pytest.raises(ValueError): + writer.delete_documents("fake_field", "frankenstein") + + with pytest.raises(ValueError): + writer.delete_documents("title", b"frankenstein") + + writer.delete_documents("title", "frankenstein") + writer.commit() + ram_index.reload() + + result = ram_index.searcher().search(query, top_docs) + assert len(result) == 0 + + PATH_TO_INDEX = "tests/test_index/" class TestFromDiskClass(object): - def test_exists(self): # prefer to keep it separate in case anyone deletes this # runs from the root directory @@ -136,6 +167,7 @@ class TestFromDiskClass(object): writer.add_document(Document(title="mytitle2", body="mybody2")) writer.commit() import time + for i in range(50): # The index should be automatically reloaded. # Wait for at most 5s for it to happen. @@ -151,7 +183,6 @@ class TestSearcher(object): class TestDocument(object): - def test_document(self): doc = tantivy.Document(name="Bill", reference=[1, 2]) assert doc["reference"] == [1, 2] @@ -162,7 +193,8 @@ class TestDocument(object): def test_document_with_date(self): import datetime - date = datetime.datetime(2019, 8, 12, 13, 0, 0, ) + + date = datetime.datetime(2019, 8, 12, 13, 0, 0) doc = tantivy.Document(name="Bill", date=date) assert doc["date"][0] == date @@ -174,15 +206,15 @@ class TestDocument(object): doc = tantivy.Document() facet = tantivy.Facet.from_string("/europe/france") doc.add_facet("facet", facet) - assert doc["facet"][0].to_path() == ['europe', 'france'] + assert doc["facet"][0].to_path() == ["europe", "france"] doc = tantivy.Document() facet = tantivy.Facet.from_string("/asia\\/oceania/fiji") doc.add_facet("facet", facet) - assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] + assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"] assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji" assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)" doc = tantivy.Document(facet=facet) - assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] + assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"] def test_document_error(self): with pytest.raises(ValueError):