parent
cde36c20cd
commit
eba0d55634
|
@ -36,3 +36,28 @@ best_doc = searcher.doc(best_doc_address)
|
||||||
Note: for integer search, the integer field should be indexed.
|
Note: for integer search, the integer field should be indexed.
|
||||||
|
|
||||||
For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html)
|
For more possible query formats and possible query options, see [Tantivy Query Parser Docs.](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html)
|
||||||
|
|
||||||
|
## Escape quotes inside a query string
|
||||||
|
|
||||||
|
The tantivy docs for the query parser say that special characters like quotes can be
|
||||||
|
escaped inside query values. However, it will also be necessary to surround
|
||||||
|
the search query in additional quotes, as if a phrase query were being used.
|
||||||
|
|
||||||
|
The following will NOT work:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Raises ValueError
|
||||||
|
index.parse_query(r'sea\"', ["title", "body"])
|
||||||
|
```
|
||||||
|
|
||||||
|
However, the following will succeed:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Works!
|
||||||
|
index.parse_query(r'"sea\""', ["title", "body"])
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that whether the included (and escaped) quote actually gets used
|
||||||
|
to match documents depends on the tokenizer used for the field. For example,
|
||||||
|
the default tokenizer will not match the document "sea\"s" with the query
|
||||||
|
"sea\"", because this tokenizer discards punctuation.
|
||||||
|
|
|
@ -0,0 +1,191 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tantivy import SchemaBuilder, Index, Document
|
||||||
|
|
||||||
|
|
||||||
|
def schema():
|
||||||
|
return (
|
||||||
|
SchemaBuilder()
|
||||||
|
.add_text_field("title", stored=True)
|
||||||
|
.add_text_field("body")
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def schema_numeric_fields():
|
||||||
|
return (
|
||||||
|
SchemaBuilder()
|
||||||
|
.add_integer_field("id", stored=True, indexed=True)
|
||||||
|
.add_float_field("rating", stored=True, indexed=True)
|
||||||
|
.add_boolean_field("is_good", stored=True, indexed=True)
|
||||||
|
.add_text_field("body", stored=True)
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def create_index(dir=None):
|
||||||
|
# assume all tests will use the same documents for now
|
||||||
|
# other methods may set up function-local indexes
|
||||||
|
index = Index(schema(), dir)
|
||||||
|
writer = index.writer(15_000_000, 1)
|
||||||
|
|
||||||
|
# 2 ways of adding documents
|
||||||
|
# 1
|
||||||
|
doc = Document()
|
||||||
|
# create a document instance
|
||||||
|
# add field-value pairs
|
||||||
|
doc.add_text("title", "The Old Man and the Sea")
|
||||||
|
doc.add_text(
|
||||||
|
"body",
|
||||||
|
(
|
||||||
|
"He was an old man who fished alone in a skiff in"
|
||||||
|
"the Gulf Stream and he had gone eighty-four days "
|
||||||
|
"now without taking a fish."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
# 2 use the built-in json support
|
||||||
|
# keys need to coincide with field names
|
||||||
|
doc = Document.from_dict(
|
||||||
|
{
|
||||||
|
"title": "Of Mice and Men",
|
||||||
|
"body": (
|
||||||
|
"A few miles south of Soledad, the Salinas River drops "
|
||||||
|
"in close to the hillside bank and runs deep and "
|
||||||
|
"green. The water is warm too, for it has slipped "
|
||||||
|
"twinkling over the yellow sands in the sunlight "
|
||||||
|
"before reaching the narrow pool. On one side of the "
|
||||||
|
"river the golden foothill slopes curve up to the "
|
||||||
|
"strong and rocky Gabilan Mountains, but on the valley "
|
||||||
|
"side the water is lined with trees—willows fresh and "
|
||||||
|
"green with every spring, carrying in their lower leaf "
|
||||||
|
"junctures the debris of the winter’s flooding; and "
|
||||||
|
"sycamores with mottled, white, recumbent limbs and "
|
||||||
|
"branches that arch over the pool"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.add_json(
|
||||||
|
"""{
|
||||||
|
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||||
|
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
|
||||||
|
}"""
|
||||||
|
)
|
||||||
|
writer.commit()
|
||||||
|
index.reload()
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def create_index_with_numeric_fields(dir=None):
|
||||||
|
index = Index(schema_numeric_fields(), dir)
|
||||||
|
writer = index.writer(15_000_000, 1)
|
||||||
|
|
||||||
|
doc = Document()
|
||||||
|
doc.add_integer("id", 1)
|
||||||
|
doc.add_float("rating", 3.5)
|
||||||
|
doc.add_boolean("is_good", True)
|
||||||
|
doc.add_text(
|
||||||
|
"body",
|
||||||
|
(
|
||||||
|
"He was an old man who fished alone in a skiff in"
|
||||||
|
"the Gulf Stream and he had gone eighty-four days "
|
||||||
|
"now without taking a fish."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
doc = Document.from_dict(
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"rating": 4.5,
|
||||||
|
"is_good": False,
|
||||||
|
"body": (
|
||||||
|
"A few miles south of Soledad, the Salinas River drops "
|
||||||
|
"in close to the hillside bank and runs deep and "
|
||||||
|
"green. The water is warm too, for it has slipped "
|
||||||
|
"twinkling over the yellow sands in the sunlight "
|
||||||
|
"before reaching the narrow pool. On one side of the "
|
||||||
|
"river the golden foothill slopes curve up to the "
|
||||||
|
"strong and rocky Gabilan Mountains, but on the valley "
|
||||||
|
"side the water is lined with trees—willows fresh and "
|
||||||
|
"green with every spring, carrying in their lower leaf "
|
||||||
|
"junctures the debris of the winter’s flooding; and "
|
||||||
|
"sycamores with mottled, white, recumbent limbs and "
|
||||||
|
"branches that arch over the pool"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.commit()
|
||||||
|
index.reload()
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def spanish_schema():
|
||||||
|
return (
|
||||||
|
SchemaBuilder()
|
||||||
|
.add_text_field("title", stored=True, tokenizer_name="es_stem")
|
||||||
|
.add_text_field("body", tokenizer_name="es_stem")
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def create_spanish_index():
|
||||||
|
# assume all tests will use the same documents for now
|
||||||
|
# other methods may set up function-local indexes
|
||||||
|
index = Index(spanish_schema(), None)
|
||||||
|
writer = index.writer()
|
||||||
|
|
||||||
|
# 2 ways of adding documents
|
||||||
|
# 1
|
||||||
|
doc = Document()
|
||||||
|
# create a document instance
|
||||||
|
# add field-value pairs
|
||||||
|
doc.add_text("title", "El viejo y el mar")
|
||||||
|
doc.add_text(
|
||||||
|
"body",
|
||||||
|
(
|
||||||
|
"Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
|
||||||
|
),
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
# 2 use the built-in json support
|
||||||
|
# keys need to coincide with field names
|
||||||
|
doc = Document.from_dict(
|
||||||
|
{
|
||||||
|
"title": "De ratones y hombres",
|
||||||
|
"body": (
|
||||||
|
"Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
writer.add_document(doc)
|
||||||
|
writer.add_json(
|
||||||
|
"""{
|
||||||
|
"title": ["Frankenstein", "El moderno Prometeo"],
|
||||||
|
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
|
||||||
|
}"""
|
||||||
|
)
|
||||||
|
writer.commit()
|
||||||
|
index.reload()
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def dir_index(tmpdir):
|
||||||
|
return (tmpdir, create_index(str(tmpdir)))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="class")
|
||||||
|
def ram_index():
|
||||||
|
return create_index()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="class")
|
||||||
|
def ram_index_numeric_fields():
|
||||||
|
return create_index_with_numeric_fields()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="class")
|
||||||
|
def spanish_index():
|
||||||
|
return create_spanish_index()
|
|
@ -3,201 +3,14 @@ from io import BytesIO
|
||||||
import copy
|
import copy
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import tantivy
|
|
||||||
import pickle
|
import pickle
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import tantivy
|
import tantivy
|
||||||
|
from conftest import schema, schema_numeric_fields
|
||||||
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query
|
from tantivy import Document, Index, SchemaBuilder, SnippetGenerator, Query
|
||||||
|
|
||||||
|
|
||||||
def schema():
|
|
||||||
return (
|
|
||||||
SchemaBuilder()
|
|
||||||
.add_text_field("title", stored=True)
|
|
||||||
.add_text_field("body")
|
|
||||||
.build()
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def schema_numeric_fields():
|
|
||||||
return (
|
|
||||||
SchemaBuilder()
|
|
||||||
.add_integer_field("id", stored=True, indexed=True)
|
|
||||||
.add_float_field("rating", stored=True, indexed=True)
|
|
||||||
.add_boolean_field("is_good", stored=True, indexed=True)
|
|
||||||
.add_text_field("body", stored=True)
|
|
||||||
.build()
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def create_index(dir=None):
|
|
||||||
# assume all tests will use the same documents for now
|
|
||||||
# other methods may set up function-local indexes
|
|
||||||
index = Index(schema(), dir)
|
|
||||||
writer = index.writer(15_000_000, 1)
|
|
||||||
|
|
||||||
# 2 ways of adding documents
|
|
||||||
# 1
|
|
||||||
doc = Document()
|
|
||||||
# create a document instance
|
|
||||||
# add field-value pairs
|
|
||||||
doc.add_text("title", "The Old Man and the Sea")
|
|
||||||
doc.add_text(
|
|
||||||
"body",
|
|
||||||
(
|
|
||||||
"He was an old man who fished alone in a skiff in"
|
|
||||||
"the Gulf Stream and he had gone eighty-four days "
|
|
||||||
"now without taking a fish."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
# 2 use the built-in json support
|
|
||||||
# keys need to coincide with field names
|
|
||||||
doc = Document.from_dict(
|
|
||||||
{
|
|
||||||
"title": "Of Mice and Men",
|
|
||||||
"body": (
|
|
||||||
"A few miles south of Soledad, the Salinas River drops "
|
|
||||||
"in close to the hillside bank and runs deep and "
|
|
||||||
"green. The water is warm too, for it has slipped "
|
|
||||||
"twinkling over the yellow sands in the sunlight "
|
|
||||||
"before reaching the narrow pool. On one side of the "
|
|
||||||
"river the golden foothill slopes curve up to the "
|
|
||||||
"strong and rocky Gabilan Mountains, but on the valley "
|
|
||||||
"side the water is lined with trees—willows fresh and "
|
|
||||||
"green with every spring, carrying in their lower leaf "
|
|
||||||
"junctures the debris of the winter’s flooding; and "
|
|
||||||
"sycamores with mottled, white, recumbent limbs and "
|
|
||||||
"branches that arch over the pool"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.add_json(
|
|
||||||
"""{
|
|
||||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
|
||||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
|
|
||||||
}"""
|
|
||||||
)
|
|
||||||
writer.commit()
|
|
||||||
index.reload()
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
def create_index_with_numeric_fields(dir=None):
|
|
||||||
index = Index(schema_numeric_fields(), dir)
|
|
||||||
writer = index.writer(15_000_000, 1)
|
|
||||||
|
|
||||||
doc = Document()
|
|
||||||
doc.add_integer("id", 1)
|
|
||||||
doc.add_float("rating", 3.5)
|
|
||||||
doc.add_boolean("is_good", True)
|
|
||||||
doc.add_text(
|
|
||||||
"body",
|
|
||||||
(
|
|
||||||
"He was an old man who fished alone in a skiff in"
|
|
||||||
"the Gulf Stream and he had gone eighty-four days "
|
|
||||||
"now without taking a fish."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
doc = Document.from_dict(
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"rating": 4.5,
|
|
||||||
"is_good": False,
|
|
||||||
"body": (
|
|
||||||
"A few miles south of Soledad, the Salinas River drops "
|
|
||||||
"in close to the hillside bank and runs deep and "
|
|
||||||
"green. The water is warm too, for it has slipped "
|
|
||||||
"twinkling over the yellow sands in the sunlight "
|
|
||||||
"before reaching the narrow pool. On one side of the "
|
|
||||||
"river the golden foothill slopes curve up to the "
|
|
||||||
"strong and rocky Gabilan Mountains, but on the valley "
|
|
||||||
"side the water is lined with trees—willows fresh and "
|
|
||||||
"green with every spring, carrying in their lower leaf "
|
|
||||||
"junctures the debris of the winter’s flooding; and "
|
|
||||||
"sycamores with mottled, white, recumbent limbs and "
|
|
||||||
"branches that arch over the pool"
|
|
||||||
),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.commit()
|
|
||||||
index.reload()
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
def spanish_schema():
|
|
||||||
return (
|
|
||||||
SchemaBuilder()
|
|
||||||
.add_text_field("title", stored=True, tokenizer_name="es_stem")
|
|
||||||
.add_text_field("body", tokenizer_name="es_stem")
|
|
||||||
.build()
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def create_spanish_index():
|
|
||||||
# assume all tests will use the same documents for now
|
|
||||||
# other methods may set up function-local indexes
|
|
||||||
index = Index(spanish_schema(), None)
|
|
||||||
writer = index.writer()
|
|
||||||
|
|
||||||
# 2 ways of adding documents
|
|
||||||
# 1
|
|
||||||
doc = Document()
|
|
||||||
# create a document instance
|
|
||||||
# add field-value pairs
|
|
||||||
doc.add_text("title", "El viejo y el mar")
|
|
||||||
doc.add_text(
|
|
||||||
"body",
|
|
||||||
(
|
|
||||||
"Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
|
|
||||||
),
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
# 2 use the built-in json support
|
|
||||||
# keys need to coincide with field names
|
|
||||||
doc = Document.from_dict(
|
|
||||||
{
|
|
||||||
"title": "De ratones y hombres",
|
|
||||||
"body": (
|
|
||||||
"Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
writer.add_document(doc)
|
|
||||||
writer.add_json(
|
|
||||||
"""{
|
|
||||||
"title": ["Frankenstein", "El moderno Prometeo"],
|
|
||||||
"body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
|
|
||||||
}"""
|
|
||||||
)
|
|
||||||
writer.commit()
|
|
||||||
index.reload()
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def dir_index(tmpdir):
|
|
||||||
return (tmpdir, create_index(str(tmpdir)))
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
|
||||||
def ram_index():
|
|
||||||
return create_index()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
|
||||||
def ram_index_numeric_fields():
|
|
||||||
return create_index_with_numeric_fields()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="class")
|
|
||||||
def spanish_index():
|
|
||||||
return create_spanish_index()
|
|
||||||
|
|
||||||
|
|
||||||
class TestClass(object):
|
class TestClass(object):
|
||||||
def test_simple_search_in_dir(self, dir_index):
|
def test_simple_search_in_dir(self, dir_index):
|
||||||
_, index = dir_index
|
_, index = dir_index
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tantivy import Query
|
||||||
|
|
||||||
|
|
||||||
|
def test_escape_quote_parse_query(ram_index):
|
||||||
|
index = ram_index
|
||||||
|
with pytest.raises(ValueError) as ex:
|
||||||
|
# This test to show that surrounding quotes are required to allow
|
||||||
|
# a single to be escaped the quote, otherwise it will raise `ValueError`.
|
||||||
|
_ = index.parse_query(r'sea\"', ["title", "body"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_escape_quote_parse_query_with_quotes(ram_index):
|
||||||
|
index = ram_index
|
||||||
|
# We verify only that `parse_query` doesn't raise. We are not testing
|
||||||
|
# whether tantivy's `parse_query` is correct.
|
||||||
|
query = index.parse_query(r'"sea\""', ["title", "body"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_escape_quote_parse_query_quoted(ram_index):
|
||||||
|
index = ram_index
|
||||||
|
# We verify only that `parse_query` doesn't raise. We are not testing
|
||||||
|
# whether tantivy's `parse_query` is correct.
|
||||||
|
query = index.parse_query(r'title:"sea \"whale"')
|
||||||
|
|
||||||
|
|
||||||
|
def test_escape_quote_term_query(ram_index):
|
||||||
|
index = ram_index
|
||||||
|
# We verify only that `parse_query` doesn't raise. We are not testing
|
||||||
|
# whether tantivy's `parse_query` is correct.
|
||||||
|
query = Query.term_query(index.schema, "title", "sea\" whale")
|
Loading…
Reference in New Issue