396 lines
13 KiB
Python
396 lines
13 KiB
Python
import tantivy
|
||
import pytest
|
||
|
||
from tantivy import Document, Index, SchemaBuilder
|
||
|
||
|
||
def schema():
    """Build the two-field schema shared by the tests in this module.

    ``title`` is a text field declared with ``stored=True``; ``body`` is a
    text field declared without it.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
|
||
|
||
|
||
def create_index(dir=None):
    """Create an index over ``schema()`` and populate it with three documents.

    The documents are deliberately added in three different ways so that each
    ingestion path is exercised: field by field on a ``Document``, via
    ``Document.from_dict``, and via ``writer.add_json``.

    Args:
        dir: Forwarded unchanged as the second argument of ``Index``. Callers
            in this module pass either a directory path string or nothing.
            NOTE(review): ``None`` presumably selects an in-memory index --
            confirm against the ``Index`` constructor.

    Returns:
        The ``Index`` after the writer has committed and the index has been
        reloaded, so the three documents are visible to new searchers.
    """
    # All tests are assumed to share these documents for now; individual
    # tests may still set up their own function-local indexes.
    index = Index(schema(), dir)
    writer = index.writer()

    # 1. Create a Document instance and add field/value pairs one by one.
    doc = Document()
    doc.add_text("title", "The Old Man and the Sea")
    doc.add_text(
        "body",
        (
            # Adjacent literals are concatenated verbatim, so every fragment
            # but the last must end with a space.
            "He was an old man who fished alone in a skiff in "
            "the Gulf Stream and he had gone eighty-four days "
            "now without taking a fish."
        ),
    )
    writer.add_document(doc)

    # 2. Build the Document from a dict; keys must coincide with field names.
    doc = Document.from_dict(
        {
            "title": "Of Mice and Men",
            "body": (
                "A few miles south of Soledad, the Salinas River drops "
                "in close to the hillside bank and runs deep and "
                "green. The water is warm too, for it has slipped "
                "twinkling over the yellow sands in the sunlight "
                "before reaching the narrow pool. On one side of the "
                "river the golden foothill slopes curve up to the "
                "strong and rocky Gabilan Mountains, but on the valley "
                "side the water is lined with trees—willows fresh and "
                "green with every spring, carrying in their lower leaf "
                "junctures the debris of the winter’s flooding; and "
                "sycamores with mottled, white, recumbent limbs and "
                "branches that arch over the pool"
            ),
        }
    )
    writer.add_document(doc)

    # 3. Hand a raw JSON string to the writer. Note that "title" carries two
    #    values for this document.
    writer.add_json(
        """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
    )

    writer.commit()
    index.reload()
    return index
|
||
|
||
|
||
@pytest.fixture()
def dir_index(tmpdir):
    """Provide ``(tmpdir, index)`` where the index was created under ``tmpdir``.

    The directory is handed to ``create_index`` as a string; the original
    ``tmpdir`` object is returned alongside the index so tests can reopen it.
    """
    index = create_index(str(tmpdir))
    return (tmpdir, index)
|
||
|
||
|
||
@pytest.fixture(scope="class")
def ram_index():
    """Provide one populated index per test class, created without a directory."""
    index = create_index()
    return index
|
||
|
||
|
||
class TestClass:
    """Query-parsing and search tests, mostly against the shared documents."""

    def test_simple_search_in_dir(self, dir_index):
        """A two-term query on the directory-backed index yields one hit."""
        _, index = dir_index
        parsed = index.parse_query("sea whale", ["title", "body"])

        hits = index.searcher().search(parsed, 10).hits
        assert len(hits) == 1

    def test_simple_search_after_reuse(self, dir_index):
        """The same query still yields one hit after reopening the directory."""
        index_dir, _ = dir_index
        reopened = Index(schema(), str(index_dir))
        parsed = reopened.parse_query("sea whale", ["title", "body"])

        hits = reopened.searcher().search(parsed, 10).hits
        assert len(hits) == 1

    def test_simple_search_in_ram(self, ram_index):
        """The single hit resolves to the expected stored title."""
        parsed = ram_index.parse_query("sea whale", ["title", "body"])

        outcome = ram_index.searcher().search(parsed, 10)
        assert len(outcome.hits) == 1

        _, doc_address = outcome.hits[0]
        found = ram_index.searcher().doc(doc_address)
        assert found["title"] == ["The Old Man and the Sea"]

    def test_and_query(self, ram_index):
        """``AND`` returns only documents matching both field clauses."""
        searcher = ram_index.searcher()

        # "summer" does not occur in any body, so the intersection is empty.
        no_match = ram_index.parse_query(
            "title:men AND body:summer", default_field_names=["title", "body"]
        )
        outcome = searcher.search(no_match, 10)
        assert len(outcome.hits) == 0

        # "winter" does occur in the body of the document titled "... Men".
        one_match = ram_index.parse_query(
            "title:men AND body:winter", ["title", "body"]
        )
        outcome = searcher.search(one_match)
        assert len(outcome.hits) == 1

    def test_and_query_parser_default_fields(self, ram_index):
        """With one default field the parser emits a single term query."""
        parsed = ram_index.parse_query("winter", default_field_names=["title"])
        expected = """Query(TermQuery(Term(type=Str, field=0, "winter")))"""
        assert repr(parsed) == expected

    def test_and_query_parser_default_fields_undefined(self, ram_index):
        """Without default fields the parser emits a clause for both fields."""
        parsed = ram_index.parse_query("winter")
        expected = """Query(BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, "winter"))), (Should, TermQuery(Term(type=Str, field=1, "winter")))] })"""
        assert repr(parsed) == expected

    def test_query_errors(self, ram_index):
        """Referencing a field that is not in the schema raises ValueError."""
        # The schema has "body" but no "bod" field.
        with pytest.raises(ValueError):
            ram_index.parse_query("bod:men", ["title", "body"])

    def test_order_by_search(self):
        """``order_by_field`` and ``offset`` control hit order and count."""
        builder = SchemaBuilder()
        builder = builder.add_unsigned_field("order", fast="single")
        builder = builder.add_text_field("title", stored=True)
        order_schema = builder.build()

        index = Index(order_schema)
        writer = index.writer()

        # Insert out of order so the result order cannot come from insertion.
        rows = (
            (0, "Test title"),
            (2, "Final test title"),
            (1, "Another test title"),
        )
        for order, title in rows:
            doc = Document()
            doc.add_unsigned("order", order)
            doc.add_text("title", title)
            writer.add_document(doc)

        writer.commit()
        index.reload()

        parsed = index.parse_query("test")
        searcher = index.searcher()

        # Skipping the first two of the three matches leaves one.
        outcome = searcher.search(parsed, 10, offset=2, order_by_field="order")
        assert len(outcome.hits) == 1

        outcome = searcher.search(parsed, 10, order_by_field="order")
        assert len(outcome.hits) == 3

        # The expected sequence is highest "order" value first.
        expected_titles = ("Final test title", "Another test title", "Test title")
        for position, title in enumerate(expected_titles):
            _, doc_address = outcome.hits[position]
            found = index.searcher().doc(doc_address)
            assert found["title"] == [title]

    def test_order_by_search_without_fast_field(self):
        """Ordering by a field declared without ``fast`` returns no hits here.

        NOTE(review): the document built below is never passed to
        ``writer.add_document`` and nothing is committed, so the index is
        empty when searched. Confirm whether that is intentional.
        """
        builder = SchemaBuilder()
        builder = builder.add_unsigned_field("order")
        builder = builder.add_text_field("title", stored=True)
        order_schema = builder.build()

        index = Index(order_schema)
        writer = index.writer()

        doc = Document()
        doc.add_unsigned("order", 0)
        doc.add_text("title", "Test title")

        parsed = index.parse_query("test")

        searcher = index.searcher()
        outcome = searcher.search(parsed, 10, order_by_field="order")
        assert len(outcome.hits) == 0
|
||
|
||
|
||
class TestUpdateClass:
    """Tests that modify the contents of an already populated index."""

    def test_delete_update(self, ram_index):
        """Deleting by term removes the matching document after commit/reload."""
        parsed = ram_index.parse_query("Frankenstein", ["title"])
        before = ram_index.searcher().search(parsed, 10)
        assert len(before.hits) == 1

        writer = ram_index.writer()

        # A field name that is not in the schema is rejected.
        with pytest.raises(ValueError):
            writer.delete_documents("fake_field", "frankenstein")

        # A bytes value for the text field is rejected as well.
        with pytest.raises(ValueError):
            writer.delete_documents("title", b"frankenstein")

        writer.delete_documents("title", "frankenstein")
        writer.commit()
        ram_index.reload()

        after = ram_index.searcher().search(parsed)
        assert len(after.hits) == 0
|
||
|
||
|
||
class TestFromDiskClass:
    """Tests for opening an existing index directory and for reader reloads."""

    def test_opens_from_dir_invalid_schema(self, dir_index):
        """Reopening a directory with a different schema raises ValueError."""
        invalid_schema = SchemaBuilder().add_text_field("🐱").build()
        index_dir, _ = dir_index
        with pytest.raises(ValueError):
            Index(invalid_schema, str(index_dir), reuse=True)

    def test_opens_from_dir(self, dir_index):
        """Reopening with the matching schema exposes all three documents."""
        index_dir, _ = dir_index

        index = Index(schema(), str(index_dir), reuse=True)
        assert index.searcher().num_docs == 3

    def test_create_readers(self):
        """Check when commits become visible under "Manual" and "OnCommit".

        With the reader configured as "Manual", a commit is only visible to
        new searchers after an explicit ``reload()``. After switching to
        "OnCommit", the next commit is expected to show up without a reload;
        the test polls for up to about five seconds before failing.
        """
        import time

        idx = Index(schema())
        idx.config_reader("Manual", 4)
        assert idx.searcher().num_docs == 0

        writer = idx.writer(30000000, 1)
        writer.add_document(Document(title="mytitle", body="mybody"))
        writer.commit()
        # The reader was set to "Manual" above, so the commit is not yet
        # reflected in a fresh searcher.
        assert idx.searcher().num_docs == 0

        # Changes are reflected only once the index is manually reloaded.
        idx.reload()
        assert idx.searcher().num_docs == 1

        idx.config_reader("OnCommit", 4)
        writer.add_document(Document(title="mytitle2", body="mybody2"))
        writer.commit()

        # The index should now be reloaded automatically; poll in 0.1 s steps
        # (50 attempts, roughly 5 s) instead of calling reload().
        for _ in range(50):
            time.sleep(0.1)
            if idx.searcher().num_docs == 2:
                break
        else:
            pytest.fail(
                "index was not automatically reloaded within 5s of the commit "
                "(expected 2 documents with the 'OnCommit' reader setting)"
            )
|
||
|
||
|
||
class TestSearcher:
    """Tests for the ``Searcher`` object itself."""

    def test_searcher_repr(self, ram_index):
        """``repr`` reports the document and segment counts of the index."""
        searcher = ram_index.searcher()
        assert repr(searcher) == "Searcher(num_docs=3, num_segments=1)"
|
||
|
||
|
||
class TestDocument:
    """Tests for building and inspecting ``Document`` objects without an index."""

    def test_document(self):
        """Keyword construction exposes values by item, first value and dict."""
        bill = tantivy.Document(name="Bill", reference=[1, 2])

        # Item access always yields a list, even for a scalar keyword value.
        assert bill["reference"] == [1, 2]
        assert bill["name"] == ["Bill"]

        assert bill.get_first("name") == "Bill"
        assert bill.get_first("reference") == 1

        assert bill.to_dict() == {"name": ["Bill"], "reference": [1, 2]}

    def test_document_with_date(self):
        """A ``datetime`` value round-trips unchanged through the document."""
        import datetime

        when = datetime.datetime(2019, 8, 12, 13, 0, 0)
        bill = tantivy.Document(name="Bill", date=when)
        stored = bill["date"]
        assert stored[0] == when

    def test_document_repr(self):
        """``repr`` lists every field with its values."""
        bill = tantivy.Document(name="Bill", reference=[1, 2])
        expected = "Document(name=[Bill],reference=[1,2])"
        assert repr(bill) == expected

    def test_document_repr_utf8(self):
        """``repr`` keeps non-ASCII text intact."""
        veggies = tantivy.Document(name="野菜食べないとやばい", reference=[1, 2])
        expected = "Document(name=[野菜食べないとやばい],reference=[1,2])"
        assert repr(veggies) == expected

    def test_document_with_facet(self):
        """Facets can be added explicitly or by keyword, with escaped slashes."""
        # Plain two-segment path.
        plain = tantivy.Facet.from_string("/europe/france")
        doc = tantivy.Document()
        doc.add_facet("facet", plain)
        assert doc["facet"][0].to_path() == ["europe", "france"]

        # An escaped slash stays inside a single path segment.
        escaped = tantivy.Facet.from_string("/asia\\/oceania/fiji")
        doc = tantivy.Document()
        doc.add_facet("facet", escaped)
        stored = doc["facet"][0]
        assert stored.to_path() == ["asia/oceania", "fiji"]
        assert doc["facet"][0].to_path_str() == "/asia\\/oceania/fiji"
        assert repr(doc["facet"][0]) == "Facet(/asia\\/oceania/fiji)"

        # The same facet passed as a constructor keyword.
        doc = tantivy.Document(facet=escaped)
        assert doc["facet"][0].to_path() == ["asia/oceania", "fiji"]

    def test_document_error(self):
        """A dict is rejected as a field value with ValueError."""
        with pytest.raises(ValueError):
            tantivy.Document(name={})
|
||
|
||
|
||
class TestJsonField:
    """Tests for querying values nested inside a JSON field."""

    def test_query_from_json_field(self):
        """Top-level JSON keys can be queried through the ``attributes`` field."""
        builder = SchemaBuilder().add_json_field(
            "attributes",
            stored=True,
            tokenizer_name="default",
            index_option="position",
        )
        json_schema = builder.build()

        index = Index(json_schema)
        writer = index.writer()

        payloads = (
            """{
                "order":1.1,
                "target": "submit-button",
                "cart": {"product_id": 103},
                "description": "the best vacuum cleaner ever"
            }""",
            """{
                "order":1.2,
                "target": "submit-button",
                "cart": {"product_id": 133},
                "description": "das keyboard"
            }""",
        )
        for payload in payloads:
            doc = Document()
            doc.add_json("attributes", payload)
            writer.add_document(doc)

        writer.commit()
        index.reload()

        # (query text, expected number of hits); every search asks for 2 hits.
        cases = (
            ("target:submit-button", 2),
            ("target:submit", 2),
            ("order:1.1", 1),
        )
        for text, expected_hits in cases:
            parsed = index.parse_query(text, ["attributes"])
            outcome = index.searcher().search(parsed, 2)
            assert len(outcome.hits) == expected_hits

        # TODO: nested-path lookups are not asserted yet. The checks drafted
        # here (currently disabled) go through
        # index.parse_query_for_attributes(...), which nothing else in this
        # file uses:
        #   "cart.product_id:103", limit 1                          -> 1 hit
        #   "target:submit-button AND cart.product_id:133", limit 2 -> 1 hit
|