2019-06-04 09:09:58 +00:00
|
|
|
|
import json
import os
|
|
|
|
|
import tantivy
|
|
|
|
|
|
2019-08-02 21:32:08 +00:00
|
|
|
|
import pytest
|
|
|
|
|
|
2019-06-04 09:09:58 +00:00
|
|
|
|
|
|
|
|
|
class TestClass(object):
    """Search and document tests that share one small index.

    ``setup_class`` builds a schema with a stored ``title`` text field and a
    ``body`` text field, indexes three documents and keeps the reader,
    searcher, index, schema and field handles on the class so that every
    test method can reuse them.
    """

    @classmethod
    def setup_class(cls):
        """Create the shared schema and index, add three documents, commit.

        The index is created from the schema alone (no path is given).
        After the commit, a reader and a searcher are opened and stored on
        the class together with the index, the schema and the two fields.
        """
        # assume all tests will use the same documents for now
        # other methods may set up function-local indexes
        builder = tantivy.SchemaBuilder()

        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")

        schema = builder.build()
        index = tantivy.Index(schema)

        writer = index.writer()

        # There are 2 ways of adding documents.
        # 1: create a document instance and add field-value pairs
        doc = tantivy.Document()
        doc.add_text(title, "The Old Man and the Sea")
        # Every fragment except the last ends with a space: adjacent string
        # literals are concatenated verbatim, so a missing space would glue
        # two words together in the indexed text.
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)

        # 2: use the built-in json support
        # keys need to coincide with field names
        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winter’s flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool")
        }))

        writer.add_document(doc)

        # A list value supplies several values for the same field.
        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings.  I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking.")
        }))

        writer.add_document(doc)
        writer.commit()

        cls.reader = index.reader()
        cls.searcher = cls.reader.searcher()
        cls.index = index
        cls.schema = schema
        cls.default_args = [title, body]
        cls.title = title
        cls.body = body

    def test_simple_search(self):
        """Search "sea whale" and expect only "The Old Man and the Sea"."""
        query_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        query = query_parser.parse_query("sea whale")

        top_docs = tantivy.TopDocs(10)

        result = self.searcher.search(query, top_docs)
        print(result)

        assert len(result) == 1

        # Each hit unpacks into two items; the second is the address used to
        # fetch the stored document back from the searcher.
        _, doc_address = result[0]

        searched_doc = self.searcher.doc(doc_address)
        assert searched_doc.get_first(self.title) == "The Old Man and the Sea"

    def test_doc(self):
        """Check ``is_empty``, ``len`` and ``get_first`` on a new document."""
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)

        doc = tantivy.Document()
        assert doc.is_empty

        doc.add_text(title, "The Old Man and the Sea")

        assert doc.get_first(title) == "The Old Man and the Sea"

        assert doc.len == 1
        assert not doc.is_empty

    def test_and_query(self):
        """Check that ``AND`` only returns documents matching both clauses."""
        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        # look for an intersection of documents
        query = q_parser.parse_query("title:men AND body:summer")
        top_docs = tantivy.TopDocs(10)

        result = self.searcher.search(query, top_docs)
        print(result)

        # summer isn't present
        assert len(result) == 0

        query = q_parser.parse_query("title:men AND body:winter")
        result = self.searcher.search(query, top_docs)

        assert len(result) == 1

    def test_query_errors(self):
        """Parsing a query on an unknown field must raise ``ValueError``."""
        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        # no "bod" field
        with pytest.raises(ValueError):
            q_parser.parse_query("bod:title")
|
2019-08-02 23:39:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestFromDiskClass(object):
    """Tests that open the index stored under ``tests/test_index/``.

    The relative path means the tests have to be run from the repository
    root directory.
    """

    @classmethod
    def setup_class(cls):
        """Build the title/body schema and remember the on-disk index path."""
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")
        cls.default_args = [title, body]
        schema = builder.build()
        cls.schema = schema
        cls.index = tantivy.Index(schema)
        cls.path_to_index = "tests/test_index/"

    def test_exists(self):
        """The test index directory must be reported as an existing index."""
        # prefer to keep it separate in case anyone deletes this
        # runs from the root directory
        assert self.index.exists(self.path_to_index)

    def test_opens_from_dir(self):
        """Opening the index from its directory must not raise."""
        tantivy.Index(self.schema, self.path_to_index)

    def test_create_readers(self):
        """A reader can be created with either reload policy."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        reload_policy = "OnCommit"  # or "Manual"
        assert idx.reader(reload_policy, 4)
        assert idx.reader("Manual", 4)

    def test_create_writer_and_reader(self):
        """Compare the writer opstamp and document count with ``meta.json``."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        writer = idx.writer()
        reload_policy = "OnCommit"  # or "Manual"
        reader = idx.reader(reload_policy, 4)

        # check against the opstamp in the meta file
        # os.path.join works whether or not path_to_index ends with a slash.
        meta_path = os.path.join(self.path_to_index, "meta.json")
        with open(meta_path, encoding="utf-8") as f:
            json_file = json.load(f)

        expected_last_opstamp = json_file["opstamp"]
        # ASSUMPTION
        # We haven't had any deletes in the index
        # so max_doc per index coincides with the value of `num_docs`
        # summing them in all segments, gives the number of documents
        expected_num_docs = sum(segment["max_doc"]
                                for segment in json_file["segments"])

        assert writer.commit_opstamp == expected_last_opstamp

        q_parser = tantivy.QueryParser.for_index(idx, self.default_args)
        # get all documents
        query = q_parser.parse_query("*")
        # Ask for as many hits as the meta file says exist; a fixed limit
        # would make the count assertion fail on a larger index. The limit
        # is kept at 1 or more in case the index is empty.
        top_docs = tantivy.TopDocs(max(expected_num_docs, 1))

        # Acquire the searcher once and reuse it for every lookup.
        searcher = reader.searcher()
        docs = searcher.search(query, top_docs)
        for (_score, doc_addr) in docs:
            print(searcher.doc(doc_addr))

        assert expected_num_docs == len(docs)
|