doc: escaping quotes requires quoted queries (fix #185) (#189)

2024-01-21 21:16:47 +01:00 · 2024-01-21 21:16:47 +01:00 · eba0d55634
commit eba0d55634
parent cde36c20cd
4 changed files with 250 additions and 189 deletions
--- a/docs/reference.md
+++ b/docs/reference.md
@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address)
 Note: for integer search, the integer field should be indexed.
 For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html)
 ## Escape quotes inside a query string
 The tantivy docs for the query parser say that special characters, such as quotes,
 can be escaped inside query values. However, the search query must also be
 surrounded by an additional pair of quotes, as if it were a phrase query.
 The following will NOT work:
 ```python
 # Raises ValueError
 index.parse_query(r'sea\"', ["title", "body"])
 ```
 However, the following will succeed:
 ```python
 # Works!
 index.parse_query(r'"sea\""', ["title", "body"])
 ```
 Note that whether the included (and escaped) quote actually gets used
 to match documents depends on the tokenizer used for the field. For example,
 the default tokenizer will not match the document "sea\"s" with the query
 "sea\"", because this tokenizer discards punctuation. 
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -0,0 +1,191 @@
 import pytest
 from tantivy import SchemaBuilder, Index, Document
 def schema():
     """Build the two-field text schema used by the default test index.

     ``title`` is added with ``stored=True``; ``body`` is added with the
     builder's default options.
     """
     builder = SchemaBuilder()
     builder = builder.add_text_field("title", stored=True)
     builder = builder.add_text_field("body")
     return builder.build()
 def schema_numeric_fields():
     """Build the schema with integer, float, boolean and text fields.

     ``id``, ``rating`` and ``is_good`` are added with ``stored=True`` and
     ``indexed=True``; ``body`` is a text field added with ``stored=True``.
     """
     builder = SchemaBuilder()
     builder = builder.add_integer_field("id", stored=True, indexed=True)
     builder = builder.add_float_field("rating", stored=True, indexed=True)
     builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
     builder = builder.add_text_field("body", stored=True)
     return builder.build()
 def create_index(dir=None):
     """Create the three-document English index shared by the tests.

     Args:
         dir: Forwarded unchanged as the second argument of ``Index``;
             defaults to ``None``.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     # All tests are assumed to share these documents for now; tests that
     # need something else may set up a function-local index.
     index = Index(schema(), dir)
     writer = index.writer(15_000_000, 1)

     # Path 1: start from an empty Document and add field/value pairs.
     # NOTE(review): the first two body fragments concatenate to "inthe"
     # (no space after "in"). Kept byte-identical here; confirm whether any
     # test depends on the current text before correcting it.
     old_man_body = (
         "He was an old man who fished alone in a skiff in"
         "the Gulf Stream and he had gone eighty-four days "
         "now without taking a fish."
     )
     old_man = Document()
     old_man.add_text("title", "The Old Man and the Sea")
     old_man.add_text("body", old_man_body)
     writer.add_document(old_man)

     # Path 2: build the Document from a dict whose keys coincide with the
     # schema's field names.
     mice_and_men_fields = {
         "title": "Of Mice and Men",
         "body": (
             "A few miles south of Soledad, the Salinas River drops "
             "in close to the hillside bank and runs deep and "
             "green. The water is warm too, for it has slipped "
             "twinkling over the yellow sands in the sunlight "
             "before reaching the narrow pool. On one side of the "
             "river the golden foothill slopes curve up to the "
             "strong and rocky Gabilan Mountains, but on the valley "
             "side the water is lined with trees—willows fresh and "
             "green with every spring, carrying in their lower leaf "
             "junctures the debris of the winter’s flooding; and "
             "sycamores with mottled, white, recumbent limbs and "
             "branches that arch over the pool"
         ),
     }
     writer.add_document(Document.from_dict(mice_and_men_fields))

     # Path 3: hand a JSON string directly to the writer. The literal's
     # inner whitespace is kept exactly as it was.
     frankenstein_json = """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings.  I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
     writer.add_json(frankenstein_json)

     writer.commit()
     index.reload()
     return index
 def create_index_with_numeric_fields(dir=None):
     """Create a two-document index over the numeric-fields schema.

     Args:
         dir: Forwarded unchanged as the second argument of ``Index``;
             defaults to ``None``.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     index = Index(schema_numeric_fields(), dir)
     writer = index.writer(15_000_000, 1)

     # First document: populated field by field through the typed adders.
     # NOTE(review): the first two body fragments concatenate to "inthe"
     # (no space after "in"). Kept byte-identical here; confirm whether any
     # test depends on the current text before correcting it.
     first_body = (
         "He was an old man who fished alone in a skiff in"
         "the Gulf Stream and he had gone eighty-four days "
         "now without taking a fish."
     )
     first = Document()
     first.add_integer("id", 1)
     first.add_float("rating", 3.5)
     first.add_boolean("is_good", True)
     first.add_text("body", first_body)
     writer.add_document(first)

     # Second document: built in one step from a dict keyed by field name.
     second_fields = {
         "id": 2,
         "rating": 4.5,
         "is_good": False,
         "body": (
             "A few miles south of Soledad, the Salinas River drops "
             "in close to the hillside bank and runs deep and "
             "green. The water is warm too, for it has slipped "
             "twinkling over the yellow sands in the sunlight "
             "before reaching the narrow pool. On one side of the "
             "river the golden foothill slopes curve up to the "
             "strong and rocky Gabilan Mountains, but on the valley "
             "side the water is lined with trees—willows fresh and "
             "green with every spring, carrying in their lower leaf "
             "junctures the debris of the winter’s flooding; and "
             "sycamores with mottled, white, recumbent limbs and "
             "branches that arch over the pool"
         ),
     }
     writer.add_document(Document.from_dict(second_fields))

     writer.commit()
     index.reload()
     return index
 def spanish_schema():
     """Build the title/body text schema that uses the ``es_stem`` tokenizer.

     Both fields pass ``tokenizer_name="es_stem"``; only ``title`` is added
     with ``stored=True``.
     """
     builder = SchemaBuilder()
     builder = builder.add_text_field(
         "title", stored=True, tokenizer_name="es_stem"
     )
     builder = builder.add_text_field("body", tokenizer_name="es_stem")
     return builder.build()
 def create_spanish_index():
     """Create the three-document Spanish index used by the tests.

     The index is built from ``spanish_schema()`` with ``None`` as the
     second ``Index`` argument.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     # All tests are assumed to share these documents for now; tests that
     # need something else may set up a function-local index.
     index = Index(spanish_schema(), None)
     writer = index.writer()

     # Path 1: start from an empty Document and add field/value pairs.
     viejo_body = (
         "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
     )
     viejo = Document()
     viejo.add_text("title", "El viejo y el mar")
     viejo.add_text("body", viejo_body)
     writer.add_document(viejo)

     # Path 2: build the Document from a dict whose keys coincide with the
     # schema's field names.
     ratones_fields = {
         "title": "De ratones y hombres",
         "body": (
             "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
         ),
     }
     writer.add_document(Document.from_dict(ratones_fields))

     # Path 3: hand a JSON string directly to the writer. The literal's
     # inner whitespace is kept exactly as it was.
     frankenstein_json = """{
            "title": ["Frankenstein", "El moderno Prometeo"],
            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
        }"""
     writer.add_json(frankenstein_json)

     writer.commit()
     index.reload()
     return index
@pytest.fixture()
def dir_index(tmpdir):
    """Provide ``(tmpdir, index)`` for an index created under ``tmpdir``."""
    index = create_index(str(tmpdir))
    return tmpdir, index
@pytest.fixture(scope="class")
def ram_index():
    """Provide the default test index, created without a directory."""
    index = create_index()
    return index
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Provide the numeric-fields test index, created without a directory."""
    index = create_index_with_numeric_fields()
    return index
@pytest.fixture(scope="class")
def spanish_index():
    """Provide the Spanish test index built by ``create_spanish_index``."""
    index = create_spanish_index()
    return index
--- a/tests/tantivy_test.py
+++ b/tests/tantivy_test.py
@ -3,201 +3,14 @@ from io import BytesIO
 import copy
 import datetime
 import json
 import tantivy
 import pickle
 import pytest
 import tantivy
 from conftest import schema, schema_numeric_fields
 from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query
 def schema():
     """Build the two-field text schema used by the default test index.

     ``title`` is added with ``stored=True``; ``body`` is added with the
     builder's default options.
     """
     builder = SchemaBuilder()
     builder = builder.add_text_field("title", stored=True)
     builder = builder.add_text_field("body")
     return builder.build()
 def schema_numeric_fields():
     """Build the schema with integer, float, boolean and text fields.

     ``id``, ``rating`` and ``is_good`` are added with ``stored=True`` and
     ``indexed=True``; ``body`` is a text field added with ``stored=True``.
     """
     builder = SchemaBuilder()
     builder = builder.add_integer_field("id", stored=True, indexed=True)
     builder = builder.add_float_field("rating", stored=True, indexed=True)
     builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
     builder = builder.add_text_field("body", stored=True)
     return builder.build()
 def create_index(dir=None):
     """Create the three-document English index shared by the tests.

     Args:
         dir: Forwarded unchanged as the second argument of ``Index``;
             defaults to ``None``.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     # All tests are assumed to share these documents for now; tests that
     # need something else may set up a function-local index.
     index = Index(schema(), dir)
     writer = index.writer(15_000_000, 1)

     # Path 1: start from an empty Document and add field/value pairs.
     # NOTE(review): the first two body fragments concatenate to "inthe"
     # (no space after "in"). Kept byte-identical here; confirm whether any
     # test depends on the current text before correcting it.
     old_man_body = (
         "He was an old man who fished alone in a skiff in"
         "the Gulf Stream and he had gone eighty-four days "
         "now without taking a fish."
     )
     old_man = Document()
     old_man.add_text("title", "The Old Man and the Sea")
     old_man.add_text("body", old_man_body)
     writer.add_document(old_man)

     # Path 2: build the Document from a dict whose keys coincide with the
     # schema's field names.
     mice_and_men_fields = {
         "title": "Of Mice and Men",
         "body": (
             "A few miles south of Soledad, the Salinas River drops "
             "in close to the hillside bank and runs deep and "
             "green. The water is warm too, for it has slipped "
             "twinkling over the yellow sands in the sunlight "
             "before reaching the narrow pool. On one side of the "
             "river the golden foothill slopes curve up to the "
             "strong and rocky Gabilan Mountains, but on the valley "
             "side the water is lined with trees—willows fresh and "
             "green with every spring, carrying in their lower leaf "
             "junctures the debris of the winter’s flooding; and "
             "sycamores with mottled, white, recumbent limbs and "
             "branches that arch over the pool"
         ),
     }
     writer.add_document(Document.from_dict(mice_and_men_fields))

     # Path 3: hand a JSON string directly to the writer. The literal's
     # inner whitespace is kept exactly as it was.
     frankenstein_json = """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings.  I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
     writer.add_json(frankenstein_json)

     writer.commit()
     index.reload()
     return index
 def create_index_with_numeric_fields(dir=None):
     """Create a two-document index over the numeric-fields schema.

     Args:
         dir: Forwarded unchanged as the second argument of ``Index``;
             defaults to ``None``.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     index = Index(schema_numeric_fields(), dir)
     writer = index.writer(15_000_000, 1)

     # First document: populated field by field through the typed adders.
     # NOTE(review): the first two body fragments concatenate to "inthe"
     # (no space after "in"). Kept byte-identical here; confirm whether any
     # test depends on the current text before correcting it.
     first_body = (
         "He was an old man who fished alone in a skiff in"
         "the Gulf Stream and he had gone eighty-four days "
         "now without taking a fish."
     )
     first = Document()
     first.add_integer("id", 1)
     first.add_float("rating", 3.5)
     first.add_boolean("is_good", True)
     first.add_text("body", first_body)
     writer.add_document(first)

     # Second document: built in one step from a dict keyed by field name.
     second_fields = {
         "id": 2,
         "rating": 4.5,
         "is_good": False,
         "body": (
             "A few miles south of Soledad, the Salinas River drops "
             "in close to the hillside bank and runs deep and "
             "green. The water is warm too, for it has slipped "
             "twinkling over the yellow sands in the sunlight "
             "before reaching the narrow pool. On one side of the "
             "river the golden foothill slopes curve up to the "
             "strong and rocky Gabilan Mountains, but on the valley "
             "side the water is lined with trees—willows fresh and "
             "green with every spring, carrying in their lower leaf "
             "junctures the debris of the winter’s flooding; and "
             "sycamores with mottled, white, recumbent limbs and "
             "branches that arch over the pool"
         ),
     }
     writer.add_document(Document.from_dict(second_fields))

     writer.commit()
     index.reload()
     return index
 def spanish_schema():
     """Build the title/body text schema that uses the ``es_stem`` tokenizer.

     Both fields pass ``tokenizer_name="es_stem"``; only ``title`` is added
     with ``stored=True``.
     """
     builder = SchemaBuilder()
     builder = builder.add_text_field(
         "title", stored=True, tokenizer_name="es_stem"
     )
     builder = builder.add_text_field("body", tokenizer_name="es_stem")
     return builder.build()
 def create_spanish_index():
     """Create the three-document Spanish index used by the tests.

     The index is built from ``spanish_schema()`` with ``None`` as the
     second ``Index`` argument.

     Returns:
         The ``Index`` after the writer has committed and the index has
         been reloaded.
     """
     # All tests are assumed to share these documents for now; tests that
     # need something else may set up a function-local index.
     index = Index(spanish_schema(), None)
     writer = index.writer()

     # Path 1: start from an empty Document and add field/value pairs.
     viejo_body = (
         "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
     )
     viejo = Document()
     viejo.add_text("title", "El viejo y el mar")
     viejo.add_text("body", viejo_body)
     writer.add_document(viejo)

     # Path 2: build the Document from a dict whose keys coincide with the
     # schema's field names.
     ratones_fields = {
         "title": "De ratones y hombres",
         "body": (
             "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
         ),
     }
     writer.add_document(Document.from_dict(ratones_fields))

     # Path 3: hand a JSON string directly to the writer. The literal's
     # inner whitespace is kept exactly as it was.
     frankenstein_json = """{
            "title": ["Frankenstein", "El moderno Prometeo"],
            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
        }"""
     writer.add_json(frankenstein_json)

     writer.commit()
     index.reload()
     return index
@pytest.fixture()
def dir_index(tmpdir):
    """Provide ``(tmpdir, index)`` for an index created under ``tmpdir``."""
    index = create_index(str(tmpdir))
    return tmpdir, index
@pytest.fixture(scope="class")
def ram_index():
    """Provide the default test index, created without a directory."""
    index = create_index()
    return index
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Provide the numeric-fields test index, created without a directory."""
    index = create_index_with_numeric_fields()
    return index
@pytest.fixture(scope="class")
def spanish_index():
    """Provide the Spanish test index built by ``create_spanish_index``."""
    index = create_spanish_index()
    return index
 class TestClass(object):
    def test_simple_search_in_dir(self, dir_index):
        _, index = dir_index
--- a/tests/test_escapes.py
+++ b/tests/test_escapes.py
@ -0,0 +1,32 @@
 import pytest
 from tantivy import Query
 def test_escape_quote_parse_query(ram_index):
     """An escaped quote in an unquoted query makes ``parse_query`` raise.

     Args:
         ram_index: The index fixture whose ``parse_query`` is exercised.
     """
     # Surrounding quotes are required for a single quote to be escaped
     # inside the query; without them `parse_query` raises `ValueError`.
     with pytest.raises(ValueError):
         ram_index.parse_query(r'sea\"', ["title", "body"])
 def test_escape_quote_parse_query_with_quotes(ram_index):
     """A query wrapped in quotes may contain an escaped quote.

     Args:
         ram_index: The index fixture whose ``parse_query`` is exercised.
     """
     # We verify only that `parse_query` doesn't raise. We are not testing
     # whether tantivy's `parse_query` is correct, so the result is discarded.
     ram_index.parse_query(r'"sea\""', ["title", "body"])
 def test_escape_quote_parse_query_quoted(ram_index):
     """A field-qualified quoted value may contain an escaped quote.

     Unlike the sibling tests, no default field list is passed here; the
     field is named inside the query string itself.

     Args:
         ram_index: The index fixture whose ``parse_query`` is exercised.
     """
     # We verify only that `parse_query` doesn't raise. We are not testing
     # whether tantivy's `parse_query` is correct, so the result is discarded.
     ram_index.parse_query(r'title:"sea \"whale"')
 def test_escape_quote_term_query(ram_index):
     """``Query.term_query`` accepts a term that contains a double quote.

     Args:
         ram_index: The index fixture that supplies the schema.
     """
     # We verify only that `Query.term_query` doesn't raise. We are not
     # testing whether the resulting query matches anything, so the result
     # is discarded.
     Query.term_query(ram_index.schema, "title", "sea\" whale")