Adding delete_document function to index (#12)

* Adding Term, Field and delete_term API mapping

* Fixing test

* Fixing code doc

* Removing Term and Field and nesting code on delete_term

* Fixing lint

* Delete_documents and documentation

* Fixing style and testing exceptions

* Fixing missing return

* Fixing fmt
master
Ramon Navarro Bosch 2019-09-09 14:30:50 +02:00 committed by Paul Masurel
parent 51f080d001
commit 75f0670dd6
3 changed files with 109 additions and 38 deletions

View File

@ -109,7 +109,7 @@ where
.push(Value::from(value)); .push(Value::from(value));
} }
fn extract_value(any: &PyAny) -> PyResult<Value> { pub(crate) fn extract_value(any: &PyAny) -> PyResult<Value> {
if let Ok(s) = any.extract::<String>() { if let Ok(s) = any.extract::<String>() {
return Ok(Value::Str(s)); return Ok(Value::Str(s));
} }

View File

@ -2,15 +2,16 @@
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyAny;
use crate::document::Document; use crate::document::{extract_value, Document};
use crate::query::Query; use crate::query::Query;
use crate::schema::Schema; use crate::schema::Schema;
use crate::searcher::Searcher; use crate::searcher::Searcher;
use crate::to_pyerr; use crate::to_pyerr;
use tantivy as tv; use tantivy as tv;
use tantivy::directory::MmapDirectory; use tantivy::directory::MmapDirectory;
use tantivy::schema::{Field, NamedFieldDocument}; use tantivy::schema::{Field, NamedFieldDocument, Term, Value};
const RELOAD_POLICY: &str = "commit"; const RELOAD_POLICY: &str = "commit";
@ -94,6 +95,44 @@ impl IndexWriter {
fn commit_opstamp(&self) -> u64 { fn commit_opstamp(&self) -> u64 {
self.inner_index_writer.commit_opstamp() self.inner_index_writer.commit_opstamp()
} }
/// Delete all documents containing a given term.
///
/// The term is built from `field_name` and the Python value converted by
/// `extract_value`, and is then handed to the underlying index writer.
///
/// Args:
///     field_name (str): The field name for which we want to filter
///         deleted docs.
///     field_value (PyAny): Python object with the value we want to
///         filter.
///
/// Returns the `u64` produced by the underlying `delete_term` call
/// (presumably the operation stamp of the delete -- confirm against the
/// tantivy documentation).
///
/// If the field_name is not on the schema raises ValueError exception.
/// If the field_value is a bytes value raises ValueError exception.
/// If the field_value cannot be converted, the error raised by
/// `extract_value` is propagated unchanged.
fn delete_documents(
    &mut self,
    field_name: &str,
    field_value: &PyAny,
) -> PyResult<u64> {
    // Resolve the field first so an unknown name fails before any value
    // conversion is attempted.
    let field = self.schema.get_field(field_name).ok_or_else(|| {
        exceptions::ValueError::py_err(format!(
            "Field `{}` is not defined in the schema.",
            field_name
        ))
    })?;
    let value = extract_value(field_value)?;
    let term = match value {
        Value::Str(text) => Term::from_field_text(field, &text),
        Value::U64(num) => Term::from_field_u64(field, num),
        Value::I64(num) => Term::from_field_i64(field, num),
        Value::F64(num) => Term::from_field_f64(field, num),
        Value::Date(d) => Term::from_field_date(field, &d),
        Value::Facet(facet) => Term::from_facet(field, &facet),
        // Bytes values are rejected explicitly rather than turned into a term.
        Value::Bytes(_) => {
            return Err(exceptions::ValueError::py_err(format!(
                "Field `{}` is bytes type not deletable.",
                field_name
            )))
        }
    };
    // `term` is owned and not used afterwards, so move it into the writer
    // instead of cloning it.
    Ok(self.inner_index_writer.delete_term(term))
}
} }
/// Create a new index object. /// Create a new index object.

View File

@ -5,10 +5,7 @@ from tantivy import Document, Index, SchemaBuilder, Schema
def schema(): def schema():
return SchemaBuilder() \ return SchemaBuilder().add_text_field("title", stored=True).add_text_field("body").build()
.add_text_field("title", stored=True) \
.add_text_field("body") \
.build()
@pytest.fixture(scope="class") @pytest.fixture(scope="class")
@ -24,39 +21,49 @@ def ram_index():
# create a document instance # create a document instance
# add field-value pairs # add field-value pairs
doc.add_text("title", "The Old Man and the Sea") doc.add_text("title", "The Old Man and the Sea")
doc.add_text("body", ("He was an old man who fished alone in a skiff in" doc.add_text(
"the Gulf Stream and he had gone eighty-four days " "body",
"now without taking a fish.")) (
"He was an old man who fished alone in a skiff in"
"the Gulf Stream and he had gone eighty-four days "
"now without taking a fish."
),
)
writer.add_document(doc) writer.add_document(doc)
# 2 use the built-in json support # 2 use the built-in json support
# keys need to coincide with field names # keys need to coincide with field names
doc = Document.from_dict({ doc = Document.from_dict(
"title": "Of Mice and Men", {
"body": ("A few miles south of Soledad, the Salinas River drops " "title": "Of Mice and Men",
"in close to the hillside bank and runs deep and " "body": (
"green. The water is warm too, for it has slipped " "A few miles south of Soledad, the Salinas River drops "
"twinkling over the yellow sands in the sunlight " "in close to the hillside bank and runs deep and "
"before reaching the narrow pool. On one side of the " "green. The water is warm too, for it has slipped "
"river the golden foothill slopes curve up to the " "twinkling over the yellow sands in the sunlight "
"strong and rocky Gabilan Mountains, but on the valley " "before reaching the narrow pool. On one side of the "
"side the water is lined with trees—willows fresh and " "river the golden foothill slopes curve up to the "
"green with every spring, carrying in their lower leaf " "strong and rocky Gabilan Mountains, but on the valley "
"junctures the debris of the winters flooding; and " "side the water is lined with trees—willows fresh and "
"sycamores with mottled, white, recumbent limbs and " "green with every spring, carrying in their lower leaf "
"branches that arch over the pool") "junctures the debris of the winters flooding; and "
}) "sycamores with mottled, white, recumbent limbs and "
"branches that arch over the pool"
),
}
)
writer.add_document(doc) writer.add_document(doc)
writer.add_json("""{ writer.add_json(
"""{
"title": ["Frankenstein", "The Modern Prometheus"], "title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}""") }"""
)
writer.commit() writer.commit()
index.reload() index.reload()
return index return index
class TestClass(object): class TestClass(object):
def test_simple_search(self, ram_index): def test_simple_search(self, ram_index):
index = ram_index index = ram_index
query = index.parse_query("sea whale", ["title", "body"]) query = index.parse_query("sea whale", ["title", "body"])
@ -91,10 +98,12 @@ class TestClass(object):
def test_and_query_parser_default_fields_undefined(self, ram_index): def test_and_query_parser_default_fields_undefined(self, ram_index):
query = ram_index.parse_query("winter") query = ram_index.parse_query("winter")
assert repr(query) == "Query(BooleanQuery { subqueries: [" \ assert (
"(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), " \ repr(query) == "Query(BooleanQuery { subqueries: ["
"(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] " \ "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), "
"})" "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] "
"})"
)
def test_query_errors(self, ram_index): def test_query_errors(self, ram_index):
index = ram_index index = ram_index
@ -103,11 +112,33 @@ class TestClass(object):
index.parse_query("bod:men", ["title", "body"]) index.parse_query("bod:men", ["title", "body"])
class TestUpdateClass(object):
    """Tests for deleting documents through the index writer."""

    def test_delete_update(self, ram_index):
        """Check delete_documents error cases and a successful delete.

        The title query must match one document before the delete and none
        after the delete has been committed and the index reloaded.
        """
        frankenstein_query = ram_index.parse_query("Frankenstein", ["title"])
        collector = tantivy.TopDocs(10)

        hits_before = ram_index.searcher().search(frankenstein_query, collector)
        assert len(hits_before) == 1

        index_writer = ram_index.writer()

        # A field name that is not in the schema is expected to be rejected.
        with pytest.raises(ValueError):
            index_writer.delete_documents("fake_field", "frankenstein")

        # A bytes value is expected to be rejected as well.
        with pytest.raises(ValueError):
            index_writer.delete_documents("title", b"frankenstein")

        index_writer.delete_documents("title", "frankenstein")
        index_writer.commit()
        ram_index.reload()

        # Ask for a new searcher after the reload before searching again.
        hits_after = ram_index.searcher().search(frankenstein_query, collector)
        assert len(hits_after) == 0
PATH_TO_INDEX = "tests/test_index/" PATH_TO_INDEX = "tests/test_index/"
class TestFromDiskClass(object): class TestFromDiskClass(object):
def test_exists(self): def test_exists(self):
# prefer to keep it separate in case anyone deletes this # prefer to keep it separate in case anyone deletes this
# runs from the root directory # runs from the root directory
@ -136,6 +167,7 @@ class TestFromDiskClass(object):
writer.add_document(Document(title="mytitle2", body="mybody2")) writer.add_document(Document(title="mytitle2", body="mybody2"))
writer.commit() writer.commit()
import time import time
for i in range(50): for i in range(50):
# The index should be automatically reloaded. # The index should be automatically reloaded.
# Wait for at most 5s for it to happen. # Wait for at most 5s for it to happen.
@ -151,7 +183,6 @@ class TestSearcher(object):
class TestDocument(object): class TestDocument(object):
def test_document(self): def test_document(self):
doc = tantivy.Document(name="Bill", reference=[1, 2]) doc = tantivy.Document(name="Bill", reference=[1, 2])
assert doc["reference"] == [1, 2] assert doc["reference"] == [1, 2]
@ -162,7 +193,8 @@ class TestDocument(object):
def test_document_with_date(self): def test_document_with_date(self):
import datetime import datetime
date = datetime.datetime(2019, 8, 12, 13, 0, 0, )
date = datetime.datetime(2019, 8, 12, 13, 0, 0)
doc = tantivy.Document(name="Bill", date=date) doc = tantivy.Document(name="Bill", date=date)
assert doc["date"][0] == date assert doc["date"][0] == date
@ -174,15 +206,15 @@ class TestDocument(object):
doc = tantivy.Document() doc = tantivy.Document()
facet = tantivy.Facet.from_string("/europe/france") facet = tantivy.Facet.from_string("/europe/france")
doc.add_facet("facet", facet) doc.add_facet("facet", facet)
assert doc["facet"][0].to_path() == ['europe', 'france'] assert doc["facet"][0].to_path() == ["europe", "france"]
doc = tantivy.Document() doc = tantivy.Document()
facet = tantivy.Facet.from_string("/asia\\/oceania/fiji") facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
doc.add_facet("facet", facet) doc.add_facet("facet", facet)
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"]
assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji" assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)" assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"
doc = tantivy.Document(facet=facet) doc = tantivy.Document(facet=facet)
assert doc["facet"][0].to_path() == ['asia/oceania', 'fiji'] assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"]
def test_document_error(self): def test_document_error(self):
with pytest.raises(ValueError): with pytest.raises(ValueError):