doc: escaping quotes requires quoted queries (fix #185) (#189)

master
Caleb Hattingh 2024-01-21 21:16:47 +01:00 committed by GitHub
parent cde36c20cd
commit eba0d55634
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 250 additions and 189 deletions

View File

@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address)
Note: for integer search, the integer field should be indexed. Note: for integer search, the integer field should be indexed.
For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html) For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html)
## Escape quotes inside a query string
The tantivy docs for the query parser say that special characters like quotes can be
escaped inside query values. However, it will also be necessary to surround
the search query in additional quotes, as if a phrase query were being used.
The following will NOT work:
```python
# Raises ValueError
index.parse_query(r'sea\"', ["title", "body"])
```
However, the following will succeed:
```python
# Works!
index.parse_query(r'"sea\""', ["title", "body"])
```
Note that whether the included (and escaped) quote actually gets used
to match documents depends on the tokenizer used for the field. For example,
the default tokenizer will not match the document `"sea\"s"` with the query
`"sea\""`, because this tokenizer discards punctuation.

191
tests/conftest.py Normal file
View File

@ -0,0 +1,191 @@
import pytest
from tantivy import SchemaBuilder, Index, Document
def schema():
    """Build the text schema with a stored ``title`` field and a ``body`` field."""
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
def schema_numeric_fields():
    """Build a schema with integer, float and boolean fields plus a text body.

    ``id``, ``rating`` and ``is_good`` are declared both stored and indexed;
    ``body`` is a stored text field.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field("id", stored=True, indexed=True)
    builder = builder.add_float_field("rating", stored=True, indexed=True)
    builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
    builder = builder.add_text_field("body", stored=True)
    return builder.build()
def create_index(dir=None):
    """Create the shared English test index and fill it with three documents.

    The documents are added in three different ways: a ``Document`` built
    field by field, a ``Document`` created from a dict, and a raw JSON string
    handed directly to the writer.

    Args:
        dir: Passed straight through as the second argument of ``Index``;
            defaults to ``None``.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(schema(), dir)
    # NOTE(review): presumably heap size in bytes and number of threads —
    # confirm against the tantivy-py ``Index.writer`` signature.
    idx_writer = idx.writer(15_000_000, 1)

    # Document 1: start from an empty Document and add field/value pairs.
    # NOTE(review): the first two literals are joined without a separating
    # space, so the indexed text contains "inthe". Kept verbatim here because
    # other tests may depend on the exact indexed content.
    old_man_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    old_man = Document()
    old_man.add_text("title", "The Old Man and the Sea")
    old_man.add_text("body", old_man_body)
    idx_writer.add_document(old_man)

    # Document 2: built from a dict whose keys coincide with the field names.
    mice_and_men_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winters flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    mice_and_men = Document.from_dict(
        {"title": "Of Mice and Men", "body": mice_and_men_body}
    )
    idx_writer.add_document(mice_and_men)

    # Document 3: raw JSON given to the writer; "title" carries two values.
    frankenstein_json = """{
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"""
    idx_writer.add_json(frankenstein_json)

    idx_writer.commit()
    idx.reload()
    return idx
def create_index_with_numeric_fields(dir=None):
    """Create a test index over the numeric schema and add two documents.

    Args:
        dir: Passed straight through as the second argument of ``Index``;
            defaults to ``None``.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(schema_numeric_fields(), dir)
    # NOTE(review): presumably heap size in bytes and number of threads —
    # confirm against the tantivy-py ``Index.writer`` signature.
    idx_writer = idx.writer(15_000_000, 1)

    # Document 1: typed values added one field at a time.
    # NOTE(review): the first two literals are joined without a separating
    # space, so the stored text contains "inthe". Kept verbatim.
    first_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    first = Document()
    first.add_integer("id", 1)
    first.add_float("rating", 3.5)
    first.add_boolean("is_good", True)
    first.add_text("body", first_body)
    idx_writer.add_document(first)

    # Document 2: the same kinds of values supplied through a dict.
    second_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winters flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    second = Document.from_dict(
        {
            "id": 2,
            "rating": 4.5,
            "is_good": False,
            "body": second_body,
        },
    )
    idx_writer.add_document(second)

    idx_writer.commit()
    idx.reload()
    return idx
def spanish_schema():
    """Build the title/body text schema using the ``es_stem`` tokenizer.

    Both fields are registered with ``tokenizer_name="es_stem"``; only
    ``title`` is declared stored.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(
        "title", stored=True, tokenizer_name="es_stem"
    )
    builder = builder.add_text_field("body", tokenizer_name="es_stem")
    return builder.build()
def create_spanish_index():
    """Create the Spanish test index and fill it with three documents.

    Mirrors the three ways of adding documents: a ``Document`` built field by
    field, a ``Document`` created from a dict, and a raw JSON string handed
    to the writer.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(spanish_schema(), None)
    idx_writer = idx.writer()

    # Document 1: start from an empty Document and add field/value pairs.
    viejo_body = (
        "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    viejo = Document()
    viejo.add_text("title", "El viejo y el mar")
    viejo.add_text("body", viejo_body)
    idx_writer.add_document(viejo)

    # Document 2: built from a dict whose keys coincide with the field names.
    ratones_body = (
        "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
    )
    ratones = Document.from_dict(
        {"title": "De ratones y hombres", "body": ratones_body}
    )
    idx_writer.add_document(ratones)

    # Document 3: raw JSON given to the writer; "title" carries two values.
    frankenstein_json = """{
"title": ["Frankenstein", "El moderno Prometeo"],
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
}"""
    idx_writer.add_json(frankenstein_json)

    idx_writer.commit()
    idx.reload()
    return idx
@pytest.fixture()
def dir_index(tmpdir):
    """Return ``(tmpdir, index)`` where the index was created at ``str(tmpdir)``."""
    index = create_index(str(tmpdir))
    return (tmpdir, index)
@pytest.fixture(scope="class")
def ram_index():
    """Class-scoped fixture: the English test index created without a directory."""
    index = create_index()
    return index
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Class-scoped fixture: the numeric-fields index created without a directory."""
    index = create_index_with_numeric_fields()
    return index
@pytest.fixture(scope="class")
def spanish_index():
    """Class-scoped fixture: the Spanish test index."""
    index = create_spanish_index()
    return index

View File

@ -3,201 +3,14 @@ from io import BytesIO
import copy import copy
import datetime import datetime
import json import json
import tantivy
import pickle import pickle
import pytest import pytest
import tantivy import tantivy
from conftest import schema, schema_numeric_fields
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query
def schema():
    """Build the text schema with a stored ``title`` field and a ``body`` field."""
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
def schema_numeric_fields():
    """Build a schema with integer, float and boolean fields plus a text body.

    ``id``, ``rating`` and ``is_good`` are declared both stored and indexed;
    ``body`` is a stored text field.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field("id", stored=True, indexed=True)
    builder = builder.add_float_field("rating", stored=True, indexed=True)
    builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
    builder = builder.add_text_field("body", stored=True)
    return builder.build()
def create_index(dir=None):
    """Create the shared English test index and fill it with three documents.

    The documents are added in three different ways: a ``Document`` built
    field by field, a ``Document`` created from a dict, and a raw JSON string
    handed directly to the writer.

    Args:
        dir: Passed straight through as the second argument of ``Index``;
            defaults to ``None``.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(schema(), dir)
    # NOTE(review): presumably heap size in bytes and number of threads —
    # confirm against the tantivy-py ``Index.writer`` signature.
    idx_writer = idx.writer(15_000_000, 1)

    # Document 1: start from an empty Document and add field/value pairs.
    # NOTE(review): the first two literals are joined without a separating
    # space, so the indexed text contains "inthe". Kept verbatim here because
    # other tests may depend on the exact indexed content.
    old_man_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    old_man = Document()
    old_man.add_text("title", "The Old Man and the Sea")
    old_man.add_text("body", old_man_body)
    idx_writer.add_document(old_man)

    # Document 2: built from a dict whose keys coincide with the field names.
    mice_and_men_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winters flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    mice_and_men = Document.from_dict(
        {"title": "Of Mice and Men", "body": mice_and_men_body}
    )
    idx_writer.add_document(mice_and_men)

    # Document 3: raw JSON given to the writer; "title" carries two values.
    frankenstein_json = """{
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"""
    idx_writer.add_json(frankenstein_json)

    idx_writer.commit()
    idx.reload()
    return idx
def create_index_with_numeric_fields(dir=None):
    """Create a test index over the numeric schema and add two documents.

    Args:
        dir: Passed straight through as the second argument of ``Index``;
            defaults to ``None``.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(schema_numeric_fields(), dir)
    # NOTE(review): presumably heap size in bytes and number of threads —
    # confirm against the tantivy-py ``Index.writer`` signature.
    idx_writer = idx.writer(15_000_000, 1)

    # Document 1: typed values added one field at a time.
    # NOTE(review): the first two literals are joined without a separating
    # space, so the stored text contains "inthe". Kept verbatim.
    first_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    first = Document()
    first.add_integer("id", 1)
    first.add_float("rating", 3.5)
    first.add_boolean("is_good", True)
    first.add_text("body", first_body)
    idx_writer.add_document(first)

    # Document 2: the same kinds of values supplied through a dict.
    second_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winters flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )
    second = Document.from_dict(
        {
            "id": 2,
            "rating": 4.5,
            "is_good": False,
            "body": second_body,
        },
    )
    idx_writer.add_document(second)

    idx_writer.commit()
    idx.reload()
    return idx
def spanish_schema():
    """Build the title/body text schema using the ``es_stem`` tokenizer.

    Both fields are registered with ``tokenizer_name="es_stem"``; only
    ``title`` is declared stored.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(
        "title", stored=True, tokenizer_name="es_stem"
    )
    builder = builder.add_text_field("body", tokenizer_name="es_stem")
    return builder.build()
def create_spanish_index():
    """Create the Spanish test index and fill it with three documents.

    Mirrors the three ways of adding documents: a ``Document`` built field by
    field, a ``Document`` created from a dict, and a raw JSON string handed
    to the writer.

    Returns:
        The index, after the writer has committed and the index was reloaded.
    """
    idx = Index(spanish_schema(), None)
    idx_writer = idx.writer()

    # Document 1: start from an empty Document and add field/value pairs.
    viejo_body = (
        "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    viejo = Document()
    viejo.add_text("title", "El viejo y el mar")
    viejo.add_text("body", viejo_body)
    idx_writer.add_document(viejo)

    # Document 2: built from a dict whose keys coincide with the field names.
    ratones_body = (
        "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
    )
    ratones = Document.from_dict(
        {"title": "De ratones y hombres", "body": ratones_body}
    )
    idx_writer.add_document(ratones)

    # Document 3: raw JSON given to the writer; "title" carries two values.
    frankenstein_json = """{
"title": ["Frankenstein", "El moderno Prometeo"],
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
}"""
    idx_writer.add_json(frankenstein_json)

    idx_writer.commit()
    idx.reload()
    return idx
@pytest.fixture()
def dir_index(tmpdir):
    """Return ``(tmpdir, index)`` where the index was created at ``str(tmpdir)``."""
    index = create_index(str(tmpdir))
    return (tmpdir, index)
@pytest.fixture(scope="class")
def ram_index():
    """Class-scoped fixture: the English test index created without a directory."""
    index = create_index()
    return index
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Class-scoped fixture: the numeric-fields index created without a directory."""
    index = create_index_with_numeric_fields()
    return index
@pytest.fixture(scope="class")
def spanish_index():
    """Class-scoped fixture: the Spanish test index."""
    index = create_spanish_index()
    return index
class TestClass(object): class TestClass(object):
def test_simple_search_in_dir(self, dir_index): def test_simple_search_in_dir(self, dir_index):
_, index = dir_index _, index = dir_index

32
tests/test_escapes.py Normal file
View File

@ -0,0 +1,32 @@
import pytest
from tantivy import Query
def test_escape_quote_parse_query(ram_index):
    """An escaped quote in a query that is not itself quoted raises ValueError."""
    index = ram_index
    # This test shows that surrounding quotes are required for a single
    # escaped quote to be accepted; without them `parse_query` raises
    # `ValueError`.
    with pytest.raises(ValueError):
        index.parse_query(r'sea\"', ["title", "body"])
def test_escape_quote_parse_query_with_quotes(ram_index):
    """An escaped quote inside a fully quoted query is accepted by ``parse_query``."""
    index = ram_index
    # We verify only that `parse_query` doesn't raise. We are not testing
    # whether tantivy's `parse_query` is correct, so the result is discarded.
    index.parse_query(r'"sea\""', ["title", "body"])
def test_escape_quote_parse_query_quoted(ram_index):
    """An escaped quote inside a field-qualified quoted phrase is accepted."""
    index = ram_index
    # We verify only that `parse_query` doesn't raise. We are not testing
    # whether tantivy's `parse_query` is correct, so the result is discarded.
    index.parse_query(r'title:"sea \"whale"')
def test_escape_quote_term_query(ram_index):
    """``Query.term_query`` accepts a term value that contains a quote character."""
    index = ram_index
    # We verify only that `Query.term_query` doesn't raise. We are not testing
    # whether the resulting query matches anything, so the result is discarded.
    Query.term_query(index.schema, "title", "sea\" whale")