tantivy-py/tests/tantivy_test.py

246 lines
8.7 KiB
Python
Raw Normal View History

import tantivy
import pytest
2019-08-02 11:23:10 +00:00
from tantivy import Document, Index, SchemaBuilder, Schema
2019-08-02 11:23:10 +00:00
def schema():
    """Build the schema used by the indexes in this module.

    The schema has two text fields: ``title`` (added with ``stored=True``)
    and ``body`` (added with the builder's default options).
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
def create_index(dir=None):
    """Create an index holding three sample documents and return it.

    All tests are assumed to use the same documents for now; individual
    tests may still set up their own function-local indexes.

    Args:
        dir: Forwarded unchanged as the second argument to ``Index``.
            The ``dir_index`` fixture passes a directory path here, while
            ``ram_index`` relies on the default ``None``.

    Returns:
        The ``Index`` after the documents were committed and the index
        was reloaded.
    """
    index = Index(schema(), dir)
    writer = index.writer()

    # Way 1: create a Document instance and add field-value pairs to it.
    doc = Document()
    doc.add_text("title", "The Old Man and the Sea")
    doc.add_text(
        "body",
        (
            # Each fragment ends with a space so that adjacent literals
            # do not fuse two words together (previously "in" + "the"
            # produced the bogus token "inthe").
            "He was an old man who fished alone in a skiff in "
            "the Gulf Stream and he had gone eighty-four days "
            "now without taking a fish."
        ),
    )
    writer.add_document(doc)

    # Way 2: build the document from a dict.
    # Keys need to coincide with field names.
    doc = Document.from_dict(
        {
            "title": "Of Mice and Men",
            "body": (
                "A few miles south of Soledad, the Salinas River drops "
                "in close to the hillside bank and runs deep and "
                "green. The water is warm too, for it has slipped "
                "twinkling over the yellow sands in the sunlight "
                "before reaching the narrow pool. On one side of the "
                "river the golden foothill slopes curve up to the "
                "strong and rocky Gabilan Mountains, but on the valley "
                "side the water is lined with trees—willows fresh and "
                "green with every spring, carrying in their lower leaf "
                # NOTE: "winter's" needs its apostrophe. test_and_query
                # expects ``body:winter`` to match this document, which
                # presumably requires "winter" to be its own token.
                "junctures the debris of the winter's flooding; and "
                "sycamores with mottled, white, recumbent limbs and "
                "branches that arch over the pool"
            ),
        }
    )
    writer.add_document(doc)

    # Way 3: use the writer's built-in JSON support. ``title`` carries
    # two values here.
    writer.add_json(
        """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
    )

    writer.commit()
    index.reload()
    return index
@pytest.fixture()
def dir_index(tmpdir):
    """Build the sample index inside pytest's ``tmpdir``.

    Returns:
        A ``(tmpdir, index)`` pair so tests can either use the index
        directly or re-open it from the directory.
    """
    index = create_index(str(tmpdir))
    return (tmpdir, index)
@pytest.fixture(scope="class")
def ram_index():
    """Provide the sample index created without a directory (``dir=None``).

    The fixture is class-scoped, so tests within one test class share the
    same index instance.
    """
    index = create_index()
    return index
class TestClass(object):
    """Query parsing and search tests against the sample index."""

    def test_simple_search_in_dir(self, dir_index):
        """A "sea whale" query yields one hit on the directory-backed index."""
        index = dir_index[1]
        parsed = index.parse_query("sea whale", ["title", "body"])
        found = index.searcher().search(parsed, 10)
        assert len(found.hits) == 1

    def test_simple_search_after_reuse(self, dir_index):
        """The same query works on an Index re-created over the directory."""
        index_dir = dir_index[0]
        reopened = Index(schema(), str(index_dir))
        parsed = reopened.parse_query("sea whale", ["title", "body"])
        found = reopened.searcher().search(parsed, 10)
        assert len(found.hits) == 1

    def test_simple_search_in_ram(self, ram_index):
        """The single hit resolves to the expected stored title."""
        parsed = ram_index.parse_query("sea whale", ["title", "body"])
        found = ram_index.searcher().search(parsed, 10)
        assert len(found.hits) == 1

        # Each hit is a pair; the second element is the document address.
        _first, address = found.hits[0]
        fetched = ram_index.searcher().doc(address)
        assert fetched["title"] == ["The Old Man and the Sea"]

    def test_and_query(self, ram_index):
        """AND queries only match documents satisfying both clauses."""
        searcher = ram_index.searcher()

        # Look for an intersection of documents; "summer" isn't present.
        no_match = ram_index.parse_query(
            "title:men AND body:summer",
            default_field_names=["title", "body"],
        )
        assert len(searcher.search(no_match, 10).hits) == 0

        one_match = ram_index.parse_query(
            "title:men AND body:winter", ["title", "body"]
        )
        assert len(searcher.search(one_match).hits) == 1

    def test_and_query_parser_default_fields(self, ram_index):
        """With one default field, the parser emits a single TermQuery."""
        expected = (
            "Query(TermQuery(Term(field=0,"
            "bytes=[119, 105, 110, 116, 101, 114])))"
        )
        parsed = ram_index.parse_query("winter", default_field_names=["title"])
        assert repr(parsed) == expected

    def test_and_query_parser_default_fields_undefined(self, ram_index):
        """Without default fields, the parser targets both fields."""
        expected = (
            "Query(BooleanQuery { subqueries: ["
            "(Should, TermQuery(Term(field=0,bytes=[119, 105, 110, 116, 101, 114]))), "
            "(Should, TermQuery(Term(field=1,bytes=[119, 105, 110, 116, 101, 114])))] "
            "})"
        )
        parsed = ram_index.parse_query("winter")
        assert repr(parsed) == expected

    def test_query_errors(self, ram_index):
        """Referencing an unknown field raises ValueError."""
        # There is no "bod" field in the schema.
        with pytest.raises(ValueError):
            ram_index.parse_query("bod:men", ["title", "body"])
class TestUpdateClass(object):
    """Tests for deleting documents through the index writer."""

    def test_delete_update(self, ram_index):
        """Deleting by term removes the matching document after a reload."""
        frankenstein = ram_index.parse_query("Frankenstein", ["title"])
        before = ram_index.searcher().search(frankenstein, 10)
        assert len(before.hits) == 1

        writer = ram_index.writer()

        # Both an unknown field name and a bytes value are rejected.
        for field_name, value in (
            ("fake_field", "frankenstein"),
            ("title", b"frankenstein"),
        ):
            with pytest.raises(ValueError):
                writer.delete_documents(field_name, value)

        writer.delete_documents("title", "frankenstein")
        writer.commit()
        ram_index.reload()

        after = ram_index.searcher().search(frankenstein)
        assert len(after.hits) == 0
2019-08-02 11:23:10 +00:00
# Relative path to an on-disk index used by TestFromDiskClass. Being
# relative, it only resolves when the tests run from the root directory.
PATH_TO_INDEX = "tests/test_index/"
class TestFromDiskClass(object):
    """Tests for opening on-disk indexes and for reader reload behaviour."""

    def test_exists(self):
        """The index directory at ``PATH_TO_INDEX`` is present."""
        # Prefer to keep this separate in case anyone deletes the directory.
        # Runs from the root directory.
        assert Index.exists(PATH_TO_INDEX)

    def test_opens_from_dir_invalid_schema(self):
        """Re-opening the on-disk index with this module's schema fails."""
        with pytest.raises(ValueError):
            # The constructor itself is expected to raise, so the result
            # is intentionally not bound to a name.
            Index(schema(), PATH_TO_INDEX, reuse=True)

    def test_opens_from_dir(self, dir_index):
        """An index re-opened with ``reuse=True`` sees the three documents."""
        index_dir, _ = dir_index
        index = Index(schema(), str(index_dir), reuse=True)
        assert index.searcher().num_docs == 3

    def test_create_readers(self):
        """Check when commits become visible under each reader configuration.

        With the reader configured as "Manual", a commit is only visible
        after an explicit ``reload()``. After switching to "OnCommit", the
        commit must become visible on its own within about five seconds.
        """
        import time

        idx = Index(schema())
        idx.config_reader("Manual", 4)
        assert idx.searcher().num_docs == 0

        writer = idx.writer(30000000, 1)
        writer.add_document(Document(title="mytitle", body="mybody"))
        writer.commit()
        # The reader is in "Manual" mode: changes are reflected only when
        # the index is manually reloaded.
        assert idx.searcher().num_docs == 0
        idx.reload()
        assert idx.searcher().num_docs == 1

        idx.config_reader("OnCommit", 4)
        writer.add_document(Document(title="mytitle2", body="mybody2"))
        writer.commit()

        # The index should be reloaded automatically.
        # Wait for at most 5s (50 polls of 0.1s) for that to happen.
        for _ in range(50):
            time.sleep(0.1)
            if idx.searcher().num_docs == 2:
                break
        else:
            # An explicit failure with a message; a bare ``assert False``
            # gives no context and is stripped under ``python -O``.
            pytest.fail(
                "index was not automatically reloaded within 5s of the commit"
            )
2019-08-02 11:23:10 +00:00
class TestSearcher(object):
    """Tests for the searcher object itself."""

    def test_searcher_repr(self, ram_index):
        """The searcher repr reports document and segment counts."""
        searcher = ram_index.searcher()
        expected = "Searcher(num_docs=3, num_segments=1)"
        assert repr(searcher) == expected
class TestDocument(object):
    """Tests for constructing and reading ``tantivy.Document`` objects."""

    def test_document(self):
        """Keyword arguments become list-valued fields."""
        bill = tantivy.Document(name="Bill", reference=[1, 2])
        assert bill["reference"] == [1, 2]
        assert bill["name"] == ["Bill"]
        assert bill.get_first("name") == "Bill"
        assert bill.get_first("reference") == 1
        assert bill.to_dict() == {"name": ["Bill"], "reference": [1, 2]}

    def test_document_with_date(self):
        """A datetime value round-trips through a document field."""
        import datetime

        when = datetime.datetime(2019, 8, 12, 13, 0, 0)
        bill = tantivy.Document(name="Bill", date=when)
        assert bill["date"][0] == when

    def test_document_repr(self):
        """The repr lists every field with its values."""
        bill = tantivy.Document(name="Bill", reference=[1, 2])
        expected = "Document(name=[Bill],reference=[1,2])"
        assert repr(bill) == expected

    def test_document_with_facet(self):
        """Facets can be added, read back as paths, and passed as kwargs."""
        # Plain two-segment facet.
        simple_doc = tantivy.Document()
        simple_facet = tantivy.Facet.from_string("/europe/france")
        simple_doc.add_facet("facet", simple_facet)
        assert simple_doc["facet"][0].to_path() == ["europe", "france"]

        # Facet whose first segment contains an escaped slash.
        escaped_doc = tantivy.Document()
        escaped_facet = tantivy.Facet.from_string("/asia\\/oceania/fiji")
        escaped_doc.add_facet("facet", escaped_facet)
        stored = escaped_doc["facet"][0]
        assert stored.to_path() == ["asia/oceania", "fiji"]
        assert escaped_doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
        assert repr(escaped_doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"

        # The same facet supplied through the constructor.
        kwarg_doc = tantivy.Document(facet=escaped_facet)
        assert kwarg_doc["facet"][0].to_path() == ["asia/oceania", "fiji"]

    def test_document_error(self):
        """A dict field value is rejected with ValueError."""
        with pytest.raises(ValueError):
            tantivy.Document(name={})