diff --git a/docs/reference.md b/docs/reference.md index 8ca4294..78249a8 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address) Note: for integer search, the integer field should be indexed. For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html) + +## Escape quotes inside a query string + +The tantivy docs for the query parser say that special characters like quotes can be +escaped inside query values. However, it will also be necessary to surround +the search query in additional quotes, as if a phrase query were being used. + +The following will NOT work: + +```python +# Raises ValueError +index.parse_query(r'sea\"', ["title", "body"]) +``` + +However, the following will succeed: + +```python +# Works! +index.parse_query(r'"sea\""', ["title", "body"]) +``` + +Note that whether the included (and escaped) quote actually gets used +to match documents depends on the tokenizer used for the field. For example, +the default tokenizer will not match the document "sea\"s" with the query +"sea\"", because this tokenizer discards punctuation. 
diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..af64cb3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,191 @@ +import pytest + +from tantivy import SchemaBuilder, Index, Document + + +def schema(): + return ( + SchemaBuilder() + .add_text_field("title", stored=True) + .add_text_field("body") + .build() + ) + + +def schema_numeric_fields(): + return ( + SchemaBuilder() + .add_integer_field("id", stored=True, indexed=True) + .add_float_field("rating", stored=True, indexed=True) + .add_boolean_field("is_good", stored=True, indexed=True) + .add_text_field("body", stored=True) + .build() + ) + + +def create_index(dir=None): + # assume all tests will use the same documents for now + # other methods may set up function-local indexes + index = Index(schema(), dir) + writer = index.writer(15_000_000, 1) + + # 2 ways of adding documents + # 1 + doc = Document() + # create a document instance + # add field-value pairs + doc.add_text("title", "The Old Man and the Sea") + doc.add_text( + "body", + ( + "He was an old man who fished alone in a skiff in" + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish." + ), + ) + writer.add_document(doc) + # 2 use the built-in json support + # keys need to coincide with field names + doc = Document.from_dict( + { + "title": "Of Mice and Men", + "body": ( + "A few miles south of Soledad, the Salinas River drops " + "in close to the hillside bank and runs deep and " + "green. The water is warm too, for it has slipped " + "twinkling over the yellow sands in the sunlight " + "before reaching the narrow pool. 
On one side of the " + "river the golden foothill slopes curve up to the " + "strong and rocky Gabilan Mountains, but on the valley " + "side the water is lined with trees—willows fresh and " + "green with every spring, carrying in their lower leaf " + "junctures the debris of the winter’s flooding; and " + "sycamores with mottled, white, recumbent limbs and " + "branches that arch over the pool" + ), + } + ) + writer.add_document(doc) + writer.add_json( + """{ + "title": ["Frankenstein", "The Modern Prometheus"], + "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." + }""" + ) + writer.commit() + index.reload() + return index + + +def create_index_with_numeric_fields(dir=None): + index = Index(schema_numeric_fields(), dir) + writer = index.writer(15_000_000, 1) + + doc = Document() + doc.add_integer("id", 1) + doc.add_float("rating", 3.5) + doc.add_boolean("is_good", True) + doc.add_text( + "body", + ( + "He was an old man who fished alone in a skiff in" + "the Gulf Stream and he had gone eighty-four days " + "now without taking a fish." + ), + ) + writer.add_document(doc) + doc = Document.from_dict( + { + "id": 2, + "rating": 4.5, + "is_good": False, + "body": ( + "A few miles south of Soledad, the Salinas River drops " + "in close to the hillside bank and runs deep and " + "green. The water is warm too, for it has slipped " + "twinkling over the yellow sands in the sunlight " + "before reaching the narrow pool. 
On one side of the " + "river the golden foothill slopes curve up to the " + "strong and rocky Gabilan Mountains, but on the valley " + "side the water is lined with trees—willows fresh and " + "green with every spring, carrying in their lower leaf " + "junctures the debris of the winter’s flooding; and " + "sycamores with mottled, white, recumbent limbs and " + "branches that arch over the pool" + ), + }, + ) + writer.add_document(doc) + writer.commit() + index.reload() + return index + + +def spanish_schema(): + return ( + SchemaBuilder() + .add_text_field("title", stored=True, tokenizer_name="es_stem") + .add_text_field("body", tokenizer_name="es_stem") + .build() + ) + + +def create_spanish_index(): + # assume all tests will use the same documents for now + # other methods may set up function-local indexes + index = Index(spanish_schema(), None) + writer = index.writer() + + # 2 ways of adding documents + # 1 + doc = Document() + # create a document instance + # add field-value pairs + doc.add_text("title", "El viejo y el mar") + doc.add_text( + "body", + ( + "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. " + ), + ) + writer.add_document(doc) + # 2 use the built-in json support + # keys need to coincide with field names + doc = Document.from_dict( + { + "title": "De ratones y hombres", + "body": ( + "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. 
A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque" + ), + } + ) + writer.add_document(doc) + writer.add_json( + """{ + "title": ["Frankenstein", "El moderno Prometeo"], + "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa." + }""" + ) + writer.commit() + index.reload() + return index + + +@pytest.fixture() +def dir_index(tmpdir): + return (tmpdir, create_index(str(tmpdir))) + + +@pytest.fixture(scope="class") +def ram_index(): + return create_index() + + +@pytest.fixture(scope="class") +def ram_index_numeric_fields(): + return create_index_with_numeric_fields() + + +@pytest.fixture(scope="class") +def spanish_index(): + return create_spanish_index() diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 80c1719..cabe977 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -3,201 +3,14 @@ from io import BytesIO import copy import datetime import json -import tantivy import pickle import pytest + import tantivy +from conftest import schema, schema_numeric_fields from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query -def schema(): - return ( - SchemaBuilder() - .add_text_field("title", stored=True) - .add_text_field("body") - .build() - ) - - -def schema_numeric_fields(): - return ( - SchemaBuilder() - .add_integer_field("id", stored=True, indexed=True) - .add_float_field("rating", stored=True, indexed=True) - 
.add_boolean_field("is_good", stored=True, indexed=True) - .add_text_field("body", stored=True) - .build() - ) - - -def create_index(dir=None): - # assume all tests will use the same documents for now - # other methods may set up function-local indexes - index = Index(schema(), dir) - writer = index.writer(15_000_000, 1) - - # 2 ways of adding documents - # 1 - doc = Document() - # create a document instance - # add field-value pairs - doc.add_text("title", "The Old Man and the Sea") - doc.add_text( - "body", - ( - "He was an old man who fished alone in a skiff in" - "the Gulf Stream and he had gone eighty-four days " - "now without taking a fish." - ), - ) - writer.add_document(doc) - # 2 use the built-in json support - # keys need to coincide with field names - doc = Document.from_dict( - { - "title": "Of Mice and Men", - "body": ( - "A few miles south of Soledad, the Salinas River drops " - "in close to the hillside bank and runs deep and " - "green. The water is warm too, for it has slipped " - "twinkling over the yellow sands in the sunlight " - "before reaching the narrow pool. On one side of the " - "river the golden foothill slopes curve up to the " - "strong and rocky Gabilan Mountains, but on the valley " - "side the water is lined with trees—willows fresh and " - "green with every spring, carrying in their lower leaf " - "junctures the debris of the winter’s flooding; and " - "sycamores with mottled, white, recumbent limbs and " - "branches that arch over the pool" - ), - } - ) - writer.add_document(doc) - writer.add_json( - """{ - "title": ["Frankenstein", "The Modern Prometheus"], - "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." 
- }""" - ) - writer.commit() - index.reload() - return index - - -def create_index_with_numeric_fields(dir=None): - index = Index(schema_numeric_fields(), dir) - writer = index.writer(15_000_000, 1) - - doc = Document() - doc.add_integer("id", 1) - doc.add_float("rating", 3.5) - doc.add_boolean("is_good", True) - doc.add_text( - "body", - ( - "He was an old man who fished alone in a skiff in" - "the Gulf Stream and he had gone eighty-four days " - "now without taking a fish." - ), - ) - writer.add_document(doc) - doc = Document.from_dict( - { - "id": 2, - "rating": 4.5, - "is_good": False, - "body": ( - "A few miles south of Soledad, the Salinas River drops " - "in close to the hillside bank and runs deep and " - "green. The water is warm too, for it has slipped " - "twinkling over the yellow sands in the sunlight " - "before reaching the narrow pool. On one side of the " - "river the golden foothill slopes curve up to the " - "strong and rocky Gabilan Mountains, but on the valley " - "side the water is lined with trees—willows fresh and " - "green with every spring, carrying in their lower leaf " - "junctures the debris of the winter’s flooding; and " - "sycamores with mottled, white, recumbent limbs and " - "branches that arch over the pool" - ), - }, - ) - writer.add_document(doc) - writer.commit() - index.reload() - return index - - -def spanish_schema(): - return ( - SchemaBuilder() - .add_text_field("title", stored=True, tokenizer_name="es_stem") - .add_text_field("body", tokenizer_name="es_stem") - .build() - ) - - -def create_spanish_index(): - # assume all tests will use the same documents for now - # other methods may set up function-local indexes - index = Index(spanish_schema(), None) - writer = index.writer() - - # 2 ways of adding documents - # 1 - doc = Document() - # create a document instance - # add field-value pairs - doc.add_text("title", "El viejo y el mar") - doc.add_text( - "body", - ( - "Era un viejo que pescaba solo en un bote en el Gulf 
Stream y hacía ochenta y cuatro días que no cogía un pez. " - ), - ) - writer.add_document(doc) - # 2 use the built-in json support - # keys need to coincide with field names - doc = Document.from_dict( - { - "title": "De ratones y hombres", - "body": ( - "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque" - ), - } - ) - writer.add_document(doc) - writer.add_json( - """{ - "title": ["Frankenstein", "El moderno Prometeo"], - "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa." 
- }""" - ) - writer.commit() - index.reload() - return index - - -@pytest.fixture() -def dir_index(tmpdir): - return (tmpdir, create_index(str(tmpdir))) - - -@pytest.fixture(scope="class") -def ram_index(): - return create_index() - - -@pytest.fixture(scope="class") -def ram_index_numeric_fields(): - return create_index_with_numeric_fields() - - -@pytest.fixture(scope="class") -def spanish_index(): - return create_spanish_index() - - class TestClass(object): def test_simple_search_in_dir(self, dir_index): _, index = dir_index diff --git a/tests/test_escapes.py b/tests/test_escapes.py new file mode 100644 index 0000000..91f5d25 --- /dev/null +++ b/tests/test_escapes.py @@ -0,0 +1,32 @@ +import pytest + +from tantivy import Query + + +def test_escape_quote_parse_query(ram_index): + index = ram_index + with pytest.raises(ValueError) as ex: + # This test shows that surrounding quotes are required for an escaped + # quote to be accepted; without them `parse_query` raises `ValueError`. + _ = index.parse_query(r'sea\"', ["title", "body"]) + + +def test_escape_quote_parse_query_with_quotes(ram_index): + index = ram_index + # We verify only that `parse_query` doesn't raise. We are not testing + # whether tantivy's `parse_query` is correct. + query = index.parse_query(r'"sea\""', ["title", "body"]) + + +def test_escape_quote_parse_query_quoted(ram_index): + index = ram_index + # We verify only that `parse_query` doesn't raise. We are not testing + # whether tantivy's `parse_query` is correct. + query = index.parse_query(r'title:"sea \"whale"') + + +def test_escape_quote_term_query(ram_index): + index = ram_index + # We verify only that `Query.term_query` doesn't raise. We are not testing + # whether tantivy's `Query.term_query` is correct. + query = Query.term_query(index.schema, "title", "sea\" whale")