# tantivy-py/tests/tantivy_test.py

import json
import tantivy

import pytest


class TestClass(object):
    """Search and document tests that share one index built in ``setup_class``."""

    @classmethod
    def setup_class(cls):
        """Build a three-document index once and publish it on the class.

        Sets ``cls.index``, ``cls.schema``, ``cls.reader``, ``cls.searcher``,
        the two field handles ``cls.title`` / ``cls.body`` and
        ``cls.default_args`` (the field list handed to the query parser).
        """
        # assume all tests will use the same documents for now
        # other methods may set up function-local indexes
        builder = tantivy.SchemaBuilder()

        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")

        schema = builder.build()
        index = tantivy.Index(schema)

        writer = index.writer()

        # 2 ways of adding documents
        # 1
        doc = tantivy.Document()
        # create a document instance
        # add field-value pairs
        doc.add_text(title, "The Old Man and the Sea")
        # NOTE: adjacent literals are concatenated verbatim, so every fragment
        # except the last must end with a space to keep words apart.
        doc.add_text(body, ("He was an old man who fished alone in a skiff in "
                            "the Gulf Stream and he had gone eighty-four days "
                            "now without taking a fish."))
        writer.add_document(doc)
        # 2 use the built-in json support
        # keys need to coincide with field names
        doc = schema.parse_document(json.dumps({
            "title": "Of Mice and Men",
            "body": ("A few miles south of Soledad, the Salinas River drops "
                     "in close to the hillside bank and runs deep and "
                     "green. The water is warm too, for it has slipped "
                     "twinkling over the yellow sands in the sunlight "
                     "before reaching the narrow pool. On one side of the "
                     "river the golden foothill slopes curve up to the "
                     "strong and rocky Gabilan Mountains, but on the valley "
                     "side the water is lined with trees—willows fresh and "
                     "green with every spring, carrying in their lower leaf "
                     "junctures the debris of the winter’s flooding; and "
                     "sycamores with mottled, white, recumbent limbs and "
                     "branches that arch over the pool")
        }))

        writer.add_document(doc)

        # a JSON list supplies several values for the same field
        doc = schema.parse_document(json.dumps({
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": ("You will rejoice to hear that no disaster has "
                     "accompanied the commencement of an enterprise which you "
                     "have regarded with such evil forebodings.  I arrived "
                     "here yesterday, and my first task is to assure my dear "
                     "sister of my welfare and increasing confidence in the "
                     "success of my undertaking.")
        }))

        writer.add_document(doc)
        writer.commit()

        # the reader/searcher are created after the commit above
        cls.reader = index.reader()
        cls.searcher = cls.reader.searcher()
        cls.index = index
        cls.schema = schema
        cls.default_args = [title, body]
        cls.title = title
        cls.body = body

    def test_simple_search(self):
        """A two-term query returns exactly one hit whose stored title matches."""
        query_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        query = query_parser.parse_query("sea whale")

        top_docs = tantivy.TopDocs(10)

        result = self.searcher.search(query, top_docs)
        print(result)

        assert len(result) == 1

        # each hit is a (score, document address) pair
        _, doc_address = result[0]

        searched_doc = self.searcher.doc(doc_address)
        assert searched_doc.get_first(self.title) == "The Old Man and the Sea"

    def test_doc(self):
        """A fresh Document is empty; adding one text value makes its length 1."""
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)

        doc = tantivy.Document()
        assert doc.is_empty

        doc.add_text(title, "The Old Man and the Sea")

        assert doc.get_first(title) == "The Old Man and the Sea"

        assert doc.len == 1
        assert not doc.is_empty

    def test_and_query(self):
        """``AND`` only returns documents that satisfy both field clauses."""
        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        # look for an intersection of documents
        query = q_parser.parse_query("title:men AND body:summer")
        top_docs = tantivy.TopDocs(10)

        result = self.searcher.search(query, top_docs)
        print(result)

        # summer isn't present
        assert len(result) == 0

        query = q_parser.parse_query("title:men AND body:winter")
        result = self.searcher.search(query, top_docs)

        assert len(result) == 1

    def test_query_errors(self):
        """Querying a field that is not in the schema raises ``ValueError``."""
        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
        # no "bod" field
        with pytest.raises(ValueError):
            q_parser.parse_query("bod:title")


class TestFromDiskClass(object):
    """Tests that open the index directory committed under ``tests/test_index/``."""

    @classmethod
    def setup_class(cls):
        """Build the title/body schema and record the on-disk index path.

        Sets ``cls.schema``, ``cls.index`` (an Index created from the schema
        alone), ``cls.default_args`` and ``cls.path_to_index``.
        """
        builder = tantivy.SchemaBuilder()
        title = builder.add_text_field("title", stored=True)
        body = builder.add_text_field("body")
        cls.default_args = [title, body]
        schema = builder.build()
        cls.schema = schema
        cls.index = tantivy.Index(schema)
        # relative path: the tests are expected to run from the root directory
        cls.path_to_index = "tests/test_index/"

    def test_exists(self):
        """The committed index directory is reported as an existing index."""
        # prefer to keep it separate in case anyone deletes this
        # runs from the root directory
        assert self.index.exists(self.path_to_index)

    def test_opens_from_dir(self):
        """Constructing an Index with the schema and the directory does not raise."""
        tantivy.Index(self.schema, self.path_to_index)

    def test_create_readers(self):
        """Readers can be created with both the "OnCommit" and "Manual" policies."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        reload_policy = "OnCommit"  # or "Manual"
        assert idx.reader(reload_policy, 4)
        assert idx.reader("Manual", 4)

    def test_create_writer_and_reader(self):
        """Writer opstamp and search hit count agree with ``meta.json``."""
        idx = tantivy.Index(self.schema, self.path_to_index)
        writer = idx.writer()
        reload_policy = "OnCommit"  # or "Manual"
        reader = idx.reader(reload_policy, 4)

        # check against the opstamp in the meta file
        meta_fname = "meta.json"
        # path_to_index ends with "/", so plain concatenation yields the path
        with open("{}{}".format(self.path_to_index, meta_fname)) as f:
            json_file = json.load(f)

        expected_last_opstamp = json_file["opstamp"]
        # ASSUMPTION
        # We haven't had any deletes in the index
        # so max_doc per index coincides with the value of `num_docs`
        # summing them in all segments, gives the number of documents
        expected_num_docs = sum(segment["max_doc"]
                                for segment in json_file["segments"])
        assert writer.commit_opstamp == expected_last_opstamp

        q_parser = tantivy.QueryParser.for_index(idx, self.default_args)
        # get all documents
        query = q_parser.parse_query("*")
        # NOTE: at most 10 hits are collected, so the count check below only
        # holds while the index on disk has no more than 10 documents
        top_docs = tantivy.TopDocs(10)

        # resolve every address with the same searcher that produced it,
        # instead of asking the reader for a new searcher per document
        searcher = reader.searcher()
        docs = searcher.search(query, top_docs)
        for (_score, doc_addr) in docs:
            print(searcher.doc(doc_addr))
        assert expected_num_docs == len(docs)
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								import json
 								import tantivy
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								import pytest
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
 								class TestClass(object):
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
 								    @classmethod
 								    def setup_class(cls):
 								        # assume all tests will use the same documents for now
 								        # other methods may set up function-local indexes
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        builder = tantivy.SchemaBuilder()
 								        title = builder.add_text_field("title", stored=True)
 								        body = builder.add_text_field("body")
 								        schema = builder.build()
 								        index = tantivy.Index(schema)
 								        writer = index.writer()
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        # 2 ways of adding documents
 								        # 1
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        doc = tantivy.Document()
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        # create a document instance
 								        # add field-value pairs
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        doc.add_text(title, "The Old Man and the Sea")
 								        doc.add_text(body, ("He was an old man who fished alone in a skiff in"
 								                            "the Gulf Stream and he had gone eighty-four days "
 								                            "now without taking a fish."))
 								        writer.add_document(doc)
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        # 2 use the built-in json support
 								        # keys need to coincide with field names
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        doc = schema.parse_document(json.dumps({
 								            "title": "Of Mice and Men",
 								            "body": ("A few miles south of Soledad, the Salinas River drops "
 								                     "in close to the hillside bank and runs deep and "
 								                     "green. The water is warm too, for it has slipped "
 								                     "twinkling over the yellow sands in the sunlight "
 								                     "before reaching the narrow pool. On one side of the "
 								                     "river the golden foothill slopes curve up to the "
 								                     "strong and rocky Gabilan Mountains, but on the valley "
 								                     "side the water is lined with trees—willows fresh and "
 								                     "green with every spring, carrying in their lower leaf "
 								                     "junctures the debris of the winter’s flooding; and "
 								                     "sycamores with mottled, white, recumbent limbs and "
 								                     "branches that arch over the pool")
 								        }))
 								        writer.add_document(doc)
 								        doc = schema.parse_document(json.dumps({
 								            "title": ["Frankenstein", "The Modern Prometheus"],
 								            "body": ("You will rejoice to hear that no disaster has "
 								                     "accompanied the commencement of an enterprise which you "
 								                     "have regarded with such evil forebodings.  I arrived "
 								                     "here yesterday, and my first task is to assure my dear "
 								                     "sister of my welfare and increasing confidence in the "
 								                     "success of my undertaking.")
 								        }))
 								        writer.add_document(doc)
 								        writer.commit()
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        cls.reader = index.reader()
 								        cls.searcher = cls.reader.searcher()
 								        cls.index = index
 								        cls.schema = schema
 								        cls.default_args = [title, body]
 								        cls.title = title
 								        cls.body = body
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								    def test_simple_search(self):
 								        query_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        query = query_parser.parse_query("sea whale")
 								        top_docs = tantivy.TopDocs(10)
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        result = self.searcher.search(query, top_docs)
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
+								        print(result)
 								        assert len(result) == 1
 								        _, doc_address = result[0]
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
+								        searched_doc = self.searcher.doc(doc_address)
 								        assert searched_doc.get_first(self.title) == "The Old Man and the Sea"
-												Initial python bindings implementation.

											
										
										
											2019-06-04 09:09:58 +00:00
 								    def test_doc(self):
 								        builder = tantivy.SchemaBuilder()
 								        title = builder.add_text_field("title", stored=True)
 								        doc = tantivy.Document()
 								        assert doc.is_empty
 								        doc.add_text(title, "The Old Man and the Sea")
 								        assert doc.get_first(title) == "The Old Man and the Sea"
 								        assert doc.len == 1
 								        assert not doc.is_empty
-												Tests

added tests for AND BooleanQuery and ValueError in case of a malformed query

Moved document index setup into a classmethod.
classmethod will setup once at the start,
giving all test methods attributes to access.

I think it cuts on boilerplate in each test method

											
										
										
											2019-08-02 21:32:08 +00:00
 								    def test_and_query(self):
 								        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
 								        # look for an intersection of documents
 								        query = q_parser.parse_query("title:men AND body:summer")
 								        top_docs = tantivy.TopDocs(10)
 								        result = self.searcher.search(query, top_docs)
 								        print(result)
 								        # summer isn't present
 								        assert len(result) == 0
 								        query = q_parser.parse_query("title:men AND body:winter")
 								        result = self.searcher.search(query, top_docs)
 								        assert len(result) == 1
 								    def test_query_errors(self):
 								        q_parser = tantivy.QueryParser.for_index(self.index, self.default_args)
 								        # no "bod" field
 								        with pytest.raises(ValueError):
 								            q_parser.parse_query("bod:title")
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
 								class TestFromDiskClass(object):
 								    @classmethod
 								    def setup_class(cls):
 								        builder = tantivy.SchemaBuilder()
 								        title = builder.add_text_field("title", stored=True)
 								        body = builder.add_text_field("body")
 								        cls.default_args = [title, body]
 								        schema = builder.build()
 								        cls.schema = schema
 								        cls.index = tantivy.Index(schema)
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        cls.path_to_index = "tests/test_index/"
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
 								    def test_exists(self):
 								        # prefer to keep it separate in case anyone deletes this
 								        # runs from the root directory
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        assert self.index.exists(self.path_to_index)
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
 								    def test_opens_from_dir(self):
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        tantivy.Index(self.schema, self.path_to_index)
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
 								    def test_create_readers(self):
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        idx = tantivy.Index(self.schema, self.path_to_index)
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
+								        reload_policy = "OnCommit"  # or "Manual"
 								        assert idx.reader(reload_policy, 4)
 								        assert idx.reader("Manual", 4)
 								    def test_create_writer_and_reader(self):
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        idx = tantivy.Index(self.schema, self.path_to_index)
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
+								        writer = idx.writer()
 								        reload_policy = "OnCommit"  # or "Manual"
 								        reader = idx.reader(reload_policy, 4)
 								        # check against the opstamp in the meta file
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        meta_fname = "meta.json"
 								        with open("{}{}".format(self.path_to_index, meta_fname)) as f:
 								            json_file = json.load(f)
 								            expected_last_opstamp = json_file["opstamp"]
 								            # ASSUMPTION
 								            # We haven't had any deletes in the index
 								            # so max_doc per index coincides with the value of `num_docs`
 								            # summing them in all segments, gives the number of documents
 								            expected_num_docs = sum([segment["max_doc"]
 								                                     for segment in json_file["segments"]])
-												Test reading an index from dir

Created and committed an index dir with 3 documents

											
										
										
											2019-08-02 23:39:02 +00:00
+								        assert writer.commit_opstamp == expected_last_opstamp
 								        q_parser = tantivy.QueryParser.for_index(idx, self.default_args)
 								        # get all documents
 								        query = q_parser.parse_query("*")
 								        top_docs = tantivy.TopDocs(10)
 								        docs = reader.searcher().search(query, top_docs)
 								        for (_score, doc_addr) in docs:
 								            print(reader.searcher().doc(doc_addr))
-												use meta.json to programmatically check last opstamp and num_docs

											
										
										
											2019-08-02 23:55:31 +00:00
+								        assert expected_num_docs == len(docs)