tantivy-py/tests/tantivy_test.py

183 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import tantivy
import pytest
class TestClass:
    """Query tests that share one small index of three book excerpts."""

    @classmethod
    def setup_class(cls):
        """Build the shared index once and publish it on the class.

        Sets ``cls.index``, ``cls.schema``, ``cls.reader``, ``cls.searcher``,
        the two field handles ``cls.title`` / ``cls.body`` and
        ``cls.default_args`` (the fields handed to the query parser).
        """
        # assume all tests will use the same documents for now
        # other methods may set up function-local indexes
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")
        schema = builder.build()
        index = tantivy.Index(schema)
        writer = index.writer()

        # Two ways of adding documents.
        # 1) create a document instance and add field-value pairs
        doc = tantivy.Document()
        doc.add_text(title, "The Old Man and the Sea")
        # Adjacent literals are concatenated verbatim, so every fragment
        # except the last must end with a space.
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)

        # 2) use the built-in json support
        # keys need to coincide with field names
        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            # NOTE(review): test_and_query searches this body for the term
            # "winter", so the text must read "winter's" (not "winters");
            # this presumes the tokenizer splits on the apostrophe -- confirm.
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winter's flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool"),
        }))
        writer.add_document(doc)

        # A list value gives the document two entries for the same field.
        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings.  I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking."),
        }))
        writer.add_document(doc)
        writer.commit()

        # The reader is created only after the commit above.
        cls.reader = index.reader()
        cls.searcher = cls.reader.searcher()
        cls.index = index
        cls.schema = schema
        cls.default_args = [title, body]
        cls.title = title
        cls.body = body

    def test_simple_search(self):
        """A two-word query returns exactly one hit: the "Sea" document."""
        query_parser = tantivy.QueryParser.for_index(self.index,
                                                     self.default_args)
        query = query_parser.parse_query("sea whale")
        top_docs = tantivy.TopDocs(10)
        result = self.searcher.search(query, top_docs)
        print(result)
        assert len(result) == 1
        # each hit is a (score, address) pair; only the address is needed
        _, doc_address = result[0]
        searched_doc = self.searcher.doc(doc_address)
        assert searched_doc.get_first(self.title) == "The Old Man and the Sea"

    def test_doc(self):
        """A fresh Document is empty; adding one text value makes len 1."""
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)
        doc = tantivy.Document()
        assert doc.is_empty
        doc.add_text(title, "The Old Man and the Sea")
        assert doc.get_first(title) == "The Old Man and the Sea"
        assert doc.len == 1
        assert not doc.is_empty

    def test_and_query(self):
        """``AND`` only matches documents satisfying both field clauses."""
        q_parser = tantivy.QueryParser.for_index(self.index,
                                                 self.default_args)
        top_docs = tantivy.TopDocs(10)

        # look for an intersection of documents
        query = q_parser.parse_query("title:men AND body:summer")
        result = self.searcher.search(query, top_docs)
        print(result)
        # summer isn't present
        assert len(result) == 0

        query = q_parser.parse_query("title:men AND body:winter")
        result = self.searcher.search(query, top_docs)
        assert len(result) == 1

    def test_query_errors(self):
        """Querying a field the schema does not define raises ValueError."""
        q_parser = tantivy.QueryParser.for_index(self.index,
                                                 self.default_args)
        # no "bod" field
        with pytest.raises(ValueError):
            q_parser.parse_query("bod:title")
class TestFromDiskClass:
    """Tests that open the index stored under ``tests/test_index/``.

    The path is relative, so these tests must run from the root directory.
    """

    @classmethod
    def setup_class(cls):
        """Build the title/body schema and record the on-disk index path.

        Sets ``cls.schema``, ``cls.default_args`` (the two field handles),
        ``cls.index`` (an index created without a path) and
        ``cls.path_to_index``.
        """
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")
        cls.default_args = [title, body]
        schema = builder.build()
        cls.schema = schema
        cls.index = tantivy.Index(schema)
        # Trailing slash is required: the meta-file path below is built by
        # plain string concatenation.
        cls.path_to_index = "tests/test_index/"

    def test_exists(self):
        """The fixture directory is recognised as an existing index."""
        # prefer to keep it separate in case anyone deletes this
        # runs from the root directory
        assert self.index.exists(self.path_to_index)

    def test_opens_from_dir(self):
        """Opening the index from its directory does not raise."""
        tantivy.Index(self.schema, self.path_to_index)

    def test_create_readers(self):
        """A reader can be created with either reload policy."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        reload_policy = "OnCommit"  # or "Manual"
        assert idx.reader(reload_policy, 4)
        assert idx.reader("Manual", 4)

    def test_create_writer_and_reader(self):
        """Writer opstamp and document count agree with ``meta.json``."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        writer = idx.writer()
        reload_policy = "OnCommit"  # or "Manual"
        reader = idx.reader(reload_policy, 4)

        # check against the opstamp in the meta file
        meta_fname = "meta.json"
        meta_path = "{}{}".format(self.path_to_index, meta_fname)
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(meta_path, encoding="utf-8") as f:
            json_file = json.load(f)
        expected_last_opstamp = json_file["opstamp"]

        # ASSUMPTION
        # We haven't had any deletes in the index
        # so max_doc per index coincides with the value of `num_docs`
        # summing them in all segments, gives the number of documents
        expected_num_docs = sum(segment["max_doc"]
                                for segment in json_file["segments"])
        assert writer.commit_opstamp == expected_last_opstamp

        q_parser = tantivy.QueryParser.for_index(idx, self.default_args)
        # get all documents
        query = q_parser.parse_query("*")
        # NOTE(review): presumably TopDocs(10) caps the hits at 10, so the
        # final comparison only holds while the fixture has <= 10 documents.
        top_docs = tantivy.TopDocs(10)

        # Resolve addresses with the same searcher that produced them
        # instead of acquiring a new searcher for every hit.
        searcher = reader.searcher()
        docs = searcher.search(query, top_docs)
        for _score, doc_addr in docs:
            print(searcher.doc(doc_addr))
        assert expected_num_docs == len(docs)