parent
cde36c20cd
commit
eba0d55634
|
@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address)
|
|||
Note: for integer search, the integer field should be indexed.
|
||||
|
||||
For more query formats and query options, see the [Tantivy Query Parser docs](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
|
||||
|
||||
## Escape quotes inside a query string
|
||||
|
||||
The tantivy docs for the query parser say that special characters like quotes can be
|
||||
escaped inside query values. However, it will also be necessary to surround
|
||||
the search query in additional quotes, as if a phrase query were being used.
|
||||
|
||||
The following will NOT work:
|
||||
|
||||
```python
|
||||
# Raises ValueError
|
||||
index.parse_query(r'sea\"', ["title", "body"])
|
||||
```
|
||||
|
||||
However, the following will succeed:
|
||||
|
||||
```python
|
||||
# Works!
|
||||
index.parse_query(r'"sea\""', ["title", "body"])
|
||||
```
|
||||
|
||||
Note that whether the included (and escaped) quote actually gets used
|
||||
to match documents depends on the tokenizer used for the field. For example,
|
||||
the default tokenizer will not match the document "sea\"s" with the query
|
||||
"sea\"", because this tokenizer discards punctuation.
|
||||
|
|
|
@ -0,0 +1,191 @@
|
|||
import pytest
|
||||
|
||||
from tantivy import SchemaBuilder, Index, Document
|
||||
|
||||
|
||||
def schema():
    """Build the two-field text schema used by the default test index.

    ``title`` is added with ``stored=True``; ``body`` is added with the
    builder's default options.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
|
||||
|
||||
|
||||
def schema_numeric_fields():
    """Build a schema with integer, float, boolean and text fields.

    Every field is added with ``stored=True``; the three non-text fields
    are additionally added with ``indexed=True``.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field("id", stored=True, indexed=True)
    builder = builder.add_float_field("rating", stored=True, indexed=True)
    builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
    builder = builder.add_text_field("body", stored=True)
    return builder.build()
|
||||
|
||||
|
||||
def create_index(dir=None):
    """Create an index over ``schema()`` holding three sample documents.

    The documents are added in three different ways: a ``Document`` filled
    field by field, ``Document.from_dict``, and ``writer.add_json``.

    ``dir`` is passed straight through to ``Index`` as its second argument.
    The writer is committed and the index reloaded before it is returned.

    All tests are assumed to share these documents for now; individual
    tests may still set up their own function-local indexes.
    """
    # NOTE(review): the first fragment has no trailing space, so the
    # concatenated text reads "...in a skiff inthe Gulf Stream...". It is
    # kept as-is because this text is shared test data.
    old_man_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    mice_and_men_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )

    idx = Index(schema(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # Way 1: create a Document instance and add field-value pairs.
    old_man = Document()
    old_man.add_text("title", "The Old Man and the Sea")
    old_man.add_text("body", old_man_body)
    doc_writer.add_document(old_man)

    # Way 2: build the Document from a dict; keys need to coincide with
    # the field names.
    mice_and_men = Document.from_dict(
        {
            "title": "Of Mice and Men",
            "body": mice_and_men_body,
        }
    )
    doc_writer.add_document(mice_and_men)

    # Way 3: hand a JSON string directly to the writer.
    doc_writer.add_json(
        """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
    )

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
def create_index_with_numeric_fields(dir=None):
    """Create an index over ``schema_numeric_fields()`` with two documents.

    The first document is filled field by field on a ``Document``; the
    second is built with ``Document.from_dict``.

    ``dir`` is passed straight through to ``Index`` as its second argument.
    The writer is committed and the index reloaded before it is returned.
    """
    # NOTE(review): the first fragment has no trailing space, so the
    # concatenated text reads "...in a skiff inthe Gulf Stream...". It is
    # kept as-is because this text is shared test data.
    first_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    second_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )

    idx = Index(schema_numeric_fields(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # Document 1: populated one typed field at a time.
    first = Document()
    first.add_integer("id", 1)
    first.add_float("rating", 3.5)
    first.add_boolean("is_good", True)
    first.add_text("body", first_body)
    doc_writer.add_document(first)

    # Document 2: populated from a dict keyed by field name.
    second = Document.from_dict(
        {
            "id": 2,
            "rating": 4.5,
            "is_good": False,
            "body": second_body,
        },
    )
    doc_writer.add_document(second)

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
def spanish_schema():
    """Build the title/body text schema with the ``es_stem`` tokenizer.

    Both fields pass ``tokenizer_name="es_stem"``; only ``title`` is added
    with ``stored=True``.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(
        "title", stored=True, tokenizer_name="es_stem"
    )
    builder = builder.add_text_field("body", tokenizer_name="es_stem")
    return builder.build()
|
||||
|
||||
|
||||
def create_spanish_index():
    """Create an index over ``spanish_schema()`` with three Spanish documents.

    ``Index`` is called with ``None`` as its second argument and the writer
    is created with its default arguments. The documents are added in three
    different ways: a ``Document`` filled field by field,
    ``Document.from_dict``, and ``writer.add_json``. The writer is committed
    and the index reloaded before it is returned.

    All tests are assumed to share these documents for now; individual
    tests may still set up their own function-local indexes.
    """
    viejo_body = (
        "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    # NOTE(review): "en la s junturas" and "quesear quean" look like typos
    # in the sample text; kept verbatim because this is indexed test data.
    ratones_body = (
        "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
    )

    idx = Index(spanish_schema(), None)
    doc_writer = idx.writer()

    # Way 1: create a Document instance and add field-value pairs.
    viejo = Document()
    viejo.add_text("title", "El viejo y el mar")
    viejo.add_text("body", viejo_body)
    doc_writer.add_document(viejo)

    # Way 2: build the Document from a dict; keys need to coincide with
    # the field names.
    ratones = Document.from_dict(
        {
            "title": "De ratones y hombres",
            "body": ratones_body,
        }
    )
    doc_writer.add_document(ratones)

    # Way 3: hand a JSON string directly to the writer.
    doc_writer.add_json(
        """{
            "title": ["Frankenstein", "El moderno Prometeo"],
            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
        }"""
    )

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
@pytest.fixture()
def dir_index(tmpdir):
    """Return ``(tmpdir, index)`` where the index was created in ``tmpdir``.

    The directory is handed to ``create_index`` as a string.
    """
    on_disk_index = create_index(str(tmpdir))
    return tmpdir, on_disk_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def ram_index():
    """Class-scoped fixture: ``create_index()`` called with no directory.

    Presumably this yields an in-memory index, as the fixture name
    suggests; confirm against ``Index``.
    """
    shared_index = create_index()
    return shared_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Class-scoped fixture: the numeric-fields index, no directory given.

    Presumably this yields an in-memory index, as the fixture name
    suggests; confirm against ``Index``.
    """
    shared_index = create_index_with_numeric_fields()
    return shared_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def spanish_index():
    """Class-scoped fixture returning the index from ``create_spanish_index()``."""
    shared_index = create_spanish_index()
    return shared_index
|
|
@ -3,201 +3,14 @@ from io import BytesIO
|
|||
import copy
|
||||
import datetime
|
||||
import json
|
||||
import tantivy
|
||||
import pickle
|
||||
import pytest
|
||||
|
||||
import tantivy
|
||||
from conftest import schema, schema_numeric_fields
|
||||
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query
|
||||
|
||||
|
||||
def schema():
    """Build the two-field text schema used by the default test index.

    ``title`` is added with ``stored=True``; ``body`` is added with the
    builder's default options.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field("title", stored=True)
    builder = builder.add_text_field("body")
    return builder.build()
|
||||
|
||||
|
||||
def schema_numeric_fields():
    """Build a schema with integer, float, boolean and text fields.

    Every field is added with ``stored=True``; the three non-text fields
    are additionally added with ``indexed=True``.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field("id", stored=True, indexed=True)
    builder = builder.add_float_field("rating", stored=True, indexed=True)
    builder = builder.add_boolean_field("is_good", stored=True, indexed=True)
    builder = builder.add_text_field("body", stored=True)
    return builder.build()
|
||||
|
||||
|
||||
def create_index(dir=None):
    """Create an index over ``schema()`` holding three sample documents.

    The documents are added in three different ways: a ``Document`` filled
    field by field, ``Document.from_dict``, and ``writer.add_json``.

    ``dir`` is passed straight through to ``Index`` as its second argument.
    The writer is committed and the index reloaded before it is returned.

    All tests are assumed to share these documents for now; individual
    tests may still set up their own function-local indexes.
    """
    # NOTE(review): the first fragment has no trailing space, so the
    # concatenated text reads "...in a skiff inthe Gulf Stream...". It is
    # kept as-is because this text is shared test data.
    old_man_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    mice_and_men_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )

    idx = Index(schema(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # Way 1: create a Document instance and add field-value pairs.
    old_man = Document()
    old_man.add_text("title", "The Old Man and the Sea")
    old_man.add_text("body", old_man_body)
    doc_writer.add_document(old_man)

    # Way 2: build the Document from a dict; keys need to coincide with
    # the field names.
    mice_and_men = Document.from_dict(
        {
            "title": "Of Mice and Men",
            "body": mice_and_men_body,
        }
    )
    doc_writer.add_document(mice_and_men)

    # Way 3: hand a JSON string directly to the writer.
    doc_writer.add_json(
        """{
            "title": ["Frankenstein", "The Modern Prometheus"],
            "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
        }"""
    )

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
def create_index_with_numeric_fields(dir=None):
    """Create an index over ``schema_numeric_fields()`` with two documents.

    The first document is filled field by field on a ``Document``; the
    second is built with ``Document.from_dict``.

    ``dir`` is passed straight through to ``Index`` as its second argument.
    The writer is committed and the index reloaded before it is returned.
    """
    # NOTE(review): the first fragment has no trailing space, so the
    # concatenated text reads "...in a skiff inthe Gulf Stream...". It is
    # kept as-is because this text is shared test data.
    first_body = (
        "He was an old man who fished alone in a skiff in"
        "the Gulf Stream and he had gone eighty-four days "
        "now without taking a fish."
    )
    second_body = (
        "A few miles south of Soledad, the Salinas River drops "
        "in close to the hillside bank and runs deep and "
        "green. The water is warm too, for it has slipped "
        "twinkling over the yellow sands in the sunlight "
        "before reaching the narrow pool. On one side of the "
        "river the golden foothill slopes curve up to the "
        "strong and rocky Gabilan Mountains, but on the valley "
        "side the water is lined with trees—willows fresh and "
        "green with every spring, carrying in their lower leaf "
        "junctures the debris of the winter’s flooding; and "
        "sycamores with mottled, white, recumbent limbs and "
        "branches that arch over the pool"
    )

    idx = Index(schema_numeric_fields(), dir)
    doc_writer = idx.writer(15_000_000, 1)

    # Document 1: populated one typed field at a time.
    first = Document()
    first.add_integer("id", 1)
    first.add_float("rating", 3.5)
    first.add_boolean("is_good", True)
    first.add_text("body", first_body)
    doc_writer.add_document(first)

    # Document 2: populated from a dict keyed by field name.
    second = Document.from_dict(
        {
            "id": 2,
            "rating": 4.5,
            "is_good": False,
            "body": second_body,
        },
    )
    doc_writer.add_document(second)

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
def spanish_schema():
    """Build the title/body text schema with the ``es_stem`` tokenizer.

    Both fields pass ``tokenizer_name="es_stem"``; only ``title`` is added
    with ``stored=True``.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(
        "title", stored=True, tokenizer_name="es_stem"
    )
    builder = builder.add_text_field("body", tokenizer_name="es_stem")
    return builder.build()
|
||||
|
||||
|
||||
def create_spanish_index():
    """Create an index over ``spanish_schema()`` with three Spanish documents.

    ``Index`` is called with ``None`` as its second argument and the writer
    is created with its default arguments. The documents are added in three
    different ways: a ``Document`` filled field by field,
    ``Document.from_dict``, and ``writer.add_json``. The writer is committed
    and the index reloaded before it is returned.

    All tests are assumed to share these documents for now; individual
    tests may still set up their own function-local indexes.
    """
    viejo_body = (
        "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    # NOTE(review): "en la s junturas" and "quesear quean" look like typos
    # in the sample text; kept verbatim because this is indexed test data.
    ratones_body = (
        "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
    )

    idx = Index(spanish_schema(), None)
    doc_writer = idx.writer()

    # Way 1: create a Document instance and add field-value pairs.
    viejo = Document()
    viejo.add_text("title", "El viejo y el mar")
    viejo.add_text("body", viejo_body)
    doc_writer.add_document(viejo)

    # Way 2: build the Document from a dict; keys need to coincide with
    # the field names.
    ratones = Document.from_dict(
        {
            "title": "De ratones y hombres",
            "body": ratones_body,
        }
    )
    doc_writer.add_document(ratones)

    # Way 3: hand a JSON string directly to the writer.
    doc_writer.add_json(
        """{
            "title": ["Frankenstein", "El moderno Prometeo"],
            "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
        }"""
    )

    doc_writer.commit()
    idx.reload()
    return idx
|
||||
|
||||
|
||||
@pytest.fixture()
def dir_index(tmpdir):
    """Return ``(tmpdir, index)`` where the index was created in ``tmpdir``.

    The directory is handed to ``create_index`` as a string.
    """
    on_disk_index = create_index(str(tmpdir))
    return tmpdir, on_disk_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def ram_index():
    """Class-scoped fixture: ``create_index()`` called with no directory.

    Presumably this yields an in-memory index, as the fixture name
    suggests; confirm against ``Index``.
    """
    shared_index = create_index()
    return shared_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def ram_index_numeric_fields():
    """Class-scoped fixture: the numeric-fields index, no directory given.

    Presumably this yields an in-memory index, as the fixture name
    suggests; confirm against ``Index``.
    """
    shared_index = create_index_with_numeric_fields()
    return shared_index
|
||||
|
||||
|
||||
@pytest.fixture(scope="class")
def spanish_index():
    """Class-scoped fixture returning the index from ``create_spanish_index()``."""
    shared_index = create_spanish_index()
    return shared_index
|
||||
|
||||
|
||||
class TestClass(object):
|
||||
def test_simple_search_in_dir(self, dir_index):
|
||||
_, index = dir_index
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
import pytest
|
||||
|
||||
from tantivy import Query
|
||||
|
||||
|
||||
def test_escape_quote_parse_query(ram_index):
    """An escaped quote in an unquoted query value makes ``parse_query`` raise."""
    index = ram_index
    # This test shows that surrounding quotes are required for a single
    # escaped quote to be accepted; without them `parse_query` raises
    # `ValueError`.
    with pytest.raises(ValueError):
        index.parse_query(r'sea\"', ["title", "body"])
|
||||
|
||||
|
||||
def test_escape_quote_parse_query_with_quotes(ram_index):
    """An escaped quote inside a quoted query value parses without raising."""
    index = ram_index
    # We verify only that `parse_query` doesn't raise. We are not testing
    # whether tantivy's `parse_query` is correct, so the returned query is
    # deliberately discarded.
    index.parse_query(r'"sea\""', ["title", "body"])
|
||||
|
||||
|
||||
def test_escape_quote_parse_query_quoted(ram_index):
    """A field-qualified quoted phrase with an escaped quote parses without raising."""
    index = ram_index
    # We verify only that `parse_query` doesn't raise. We are not testing
    # whether tantivy's `parse_query` is correct, so the returned query is
    # deliberately discarded.
    index.parse_query(r'title:"sea \"whale"')
|
||||
|
||||
|
||||
def test_escape_quote_term_query(ram_index):
    """A term value containing a double quote is accepted by ``Query.term_query``."""
    index = ram_index
    # We verify only that `Query.term_query` doesn't raise. We are not
    # testing whether the resulting query matches anything, so the returned
    # query is deliberately discarded.
    Query.term_query(index.schema, "title", "sea\" whale")
|
Loading…
Reference in New Issue