Expose Tantivy's PhraseQuery (#234)

master
Tomoko Uchida 2024-05-04 05:10:58 +09:00 committed by GitHub
parent 3d394959b8
commit 03b1c89fa3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 77 additions and 9 deletions

View File

@ -157,6 +157,48 @@ impl Query {
})
}
/// Construct a Tantivy's PhraseQuery with custom offsets and slop
///
/// # Arguments
///
/// * `schema` - Schema of the target index.
/// * `field_name` - Field name to be searched.
/// * `words` - Word list that constructs the phrase. Each element is either a term value (its index in the list is then used as its offset) or an `(offset, term value)` pair.
/// * `slop` - (Optional) The number of gaps permitted between the words in the query phrase. Default is 0.
///
/// # Errors
///
/// Returns a `ValueError` if `words` is empty or contains only one word,
/// and propagates any error produced by `make_term` while building a term.
#[staticmethod]
#[pyo3(signature = (schema, field_name, words, slop = 0))]
pub(crate) fn phrase_query(
    schema: &Schema,
    field_name: &str,
    words: Vec<&PyAny>,
    slop: u32,
) -> PyResult<Query> {
    let mut terms_with_offset = Vec::with_capacity(words.len());
    for (idx, word) in words.into_iter().enumerate() {
        if let Ok((offset, value)) = word.extract() {
            // Custom offset is provided.
            let term = make_term(&schema.inner, field_name, value)?;
            terms_with_offset.push((offset, term));
        } else {
            // Custom offset is not provided. Use the list index as the offset.
            let term = make_term(&schema.inner, field_name, word)?;
            terms_with_offset.push((idx, term));
        }
    }
    if terms_with_offset.is_empty() {
        return Err(exceptions::PyValueError::new_err(
            "words must not be empty.",
        ));
    }
    // Tantivy's PhraseQuery constructor asserts that it receives more than
    // one term; report that as a regular Python exception instead of letting
    // the assertion panic across the FFI boundary.
    if terms_with_offset.len() < 2 {
        return Err(exceptions::PyValueError::new_err(
            "words must contain at least two terms.",
        ));
    }
    let inner = tv::query::PhraseQuery::new_with_offset_and_slop(
        terms_with_offset,
        slop,
    );
    Ok(Query {
        inner: Box::new(inner),
    })
}
/// Construct a Tantivy's BooleanQuery
#[staticmethod]
#[pyo3(signature = (subqueries))]

View File

@ -1,6 +1,6 @@
import datetime
from enum import Enum
from typing import Any, Optional, Sequence
from typing import Any, Optional, Sequence, Union
class Schema:
    """Placeholder stub for the schema type; no members are declared here."""
@ -206,16 +206,10 @@ class Query:
pass
@staticmethod
def fuzzy_term_query(
schema: Schema,
field_name: str,
text: str,
distance: int = 1,
transposition_cost_one: bool = True,
prefix=False,
) -> Query:
def phrase_query(
    schema: Schema,
    field_name: str,
    words: list[Union[str, tuple[int, str]]],
    slop: int = 0,
) -> Query:
    """Construct a Tantivy PhraseQuery with custom offsets and slop.

    Args:
        schema: Schema of the target index.
        field_name: Field name to be searched.
        words: Words that make up the phrase. Each item is either a term
            text (its list index is used as the offset) or an
            ``(offset, term text)`` pair.
        slop: Number of gaps permitted between the words in the phrase.
            Defaults to 0.

    Raises:
        ValueError: If ``words`` is empty.
    """
    pass
@staticmethod
def boolean_query(
    subqueries: Sequence[tuple[Occur, Query]],
) -> Query:
    """Construct a Tantivy BooleanQuery from ``(Occur, Query)`` pairs."""
    pass

View File

@ -801,6 +801,38 @@ class TestQuery(object):
result = index.searcher().search(query, 10)
assert len(result.hits) == 3
def test_phrase_query(self, ram_index):
    """Check phrase matching on the "title" field: word order, explicit
    offsets, slop, and rejection of an empty word list."""
    index = ram_index
    searcher = index.searcher()

    # Each case: (words, extra keyword arguments, expected number of hits).
    cases = [
        # Should match the title "The Old Man and the Sea".
        (["old", "man"], {}, 1),
        # Reversed word order shouldn't match any document.
        (["man", "old"], {}, 0),
        # Should match "The Old Man and the Sea" with the given offsets.
        ([(1, "man"), (0, "old")], {}, 1),
        # Shouldn't match any document with the default slop of 0.
        (["man", "sea"], {}, 0),
        # Should match the title "The Old Man and the Sea" with slop 2.
        (["man", "sea"], {"slop": 2}, 1),
    ]
    for words, extra_args, expected_hits in cases:
        query = Query.phrase_query(index.schema, "title", words, **extra_args)
        result = searcher.search(query, 10)
        assert len(result.hits) == expected_hits

    # An empty word list is rejected up front.
    with pytest.raises(ValueError, match="words must not be empty."):
        Query.phrase_query(index.schema, "title", [])
def test_fuzzy_term_query(self, ram_index):
index = ram_index
query = Query.fuzzy_term_query(index.schema, "title", "ice")